jupiter_rs/infograph/
symbols.rs

1//! Provides a `SymbolTable` and a `SymbolMap` for an efficient and compact representation
2//! of objects / inner maps.
3//!
4//! As most datasets consist of many objects which all share the same keys, we keep the actual
5//! strings in a symbol table and only store the symbols in the actual objects.
6//!
7//! This heavily reduces the memory being consumed and also permits a very cache friendly and there-
8//! fore super fast lookup for values in inner objects.
9use std::cmp::Ordering;
10use std::collections::HashMap;
11use std::slice::Iter;
12
13/// Defines the representation of a symbol.
14///
15/// Note that a single `Doc` (and its `SymbolTable') may only store up to 2^31 - 1 distinct symbols.
16pub type Symbol = i32;
17
18/// Used to resolve and lookup symbols.
19pub struct SymbolTable {
20    table: HashMap<String, Symbol>,
21    symbols: Vec<String>,
22}
23
24impl SymbolTable {
25    /// Creates a new and empty symbol table.
26    ///
27    /// # Example
28    ///
29    /// ```
30    /// # use jupiter::infograph::symbols::SymbolTable;
31    /// let table = SymbolTable::new();
32    ///
33    /// assert_eq!(table.len(), 0);
34    /// ```
35    pub fn new() -> Self {
36        SymbolTable {
37            table: HashMap::new(),
38            symbols: Vec::new(),
39        }
40    }
41
42    /// Tries to resolve the given `string` into an existing `Symbol`.
43    ///
44    /// If no symbol with the given name is known, `None` is returned. If a new
45    /// symbol should be created instead, use `find_or_create`.
46    ///
47    /// # Example
48    /// ```
49    /// # use jupiter::infograph::symbols::SymbolTable;
50    /// let mut table = SymbolTable::new();
51    ///
52    /// let symbol = table.find_or_create("Test").unwrap();
53    ///
54    /// assert_eq!(table.resolve("Test").unwrap(), symbol);
55    /// assert_eq!(table.resolve("Unknown").is_none(), true);
56    /// ```
57    pub fn resolve(&self, string: impl AsRef<str>) -> Option<Symbol> {
58        let value = string.as_ref();
59        if let Some(symbol) = self.table.get(value) {
60            Some(*symbol)
61        } else {
62            None
63        }
64    }
65
66    /// Resolve the given `string` into a new or an existing `Symbol`.
67    ///
68    /// If no symbol should be created if the given name is unknown, use `resolve`.
69    ///
70    /// # Errors
71    ///
72    /// This will return an error if the internal symbol table overflows (if there are more than
73    /// std::i32::MAX - 2 symbols).
74    ///
75    /// # Example
76    /// ```
77    /// # use jupiter::infograph::symbols::SymbolTable;
78    /// let mut table = SymbolTable::new();
79    ///
80    /// let symbol = table.find_or_create("Test").unwrap();
81    /// assert_eq!(table.resolve("Test").unwrap(), symbol);
82    /// ```
83    pub fn find_or_create(&mut self, string: impl AsRef<str>) -> anyhow::Result<Symbol> {
84        let value = string.as_ref();
85        if let Some(symbol) = self.table.get(value) {
86            Ok(*symbol)
87        } else {
88            if self.symbols.len() >= std::i32::MAX as usize {
89                Err(anyhow::anyhow!("Symbol table overflow!"))
90            } else {
91                let new_symbol = (self.symbols.len() + 1) as i32;
92
93                self.table.insert(value.to_owned(), new_symbol);
94                self.symbols.push(value.to_owned());
95
96                Ok(new_symbol)
97            }
98        }
99    }
100
101    /// Retrieves the name of the given `Symbol`.
102    ///
103    /// # Examples
104    /// ```
105    /// # use jupiter::infograph::symbols::SymbolTable;
106    /// let mut table = SymbolTable::new();
107    ///
108    /// let symbol = table.find_or_create("Test").unwrap();
109    ///
110    /// // A known symbol can be looked up...
111    /// assert_eq!(table.lookup(symbol), "Test");
112    ///
113    /// // An unknown symbol is simply translated to ""
114    /// assert_eq!(table.lookup(1024), "");
115    /// ```
116    pub fn lookup(&self, symbol: Symbol) -> &str {
117        self.symbols
118            .get((symbol - 1) as usize)
119            .map(|string| string.as_str())
120            .unwrap_or("")
121    }
122
123    /// Determines the number of known symbols in the table.
124    ///
125    /// # Examples
126    /// ```
127    /// # use jupiter::infograph::symbols::SymbolTable;
128    /// let mut table = SymbolTable::new();
129    ///
130    /// // The same symbol is only added once to a table...
131    /// let symbol = table.find_or_create("Test").unwrap();
132    /// let symbol1 = table.find_or_create("Test").unwrap();
133    /// assert_eq!(symbol, symbol1);
134    ///
135    /// // ..therefore the table size is 1.
136    /// assert_eq!(table.len(), 1);
137    ///
138    /// // If we add another symbol...
139    /// table.find_or_create("Test 2").unwrap();
140    ///
141    /// // ...the size grows to 2.
142    /// assert_eq!(table.len(), 2);
143    /// ```
144    pub fn len(&self) -> usize {
145        self.symbols.len()
146    }
147
148    /// Estimates the allocated memory required to represent the symbol table.
149    ///
150    /// Note that this is only an approximation as some inner types to not reveal their
151    /// size.
152    ///
153    /// # Example
154    ///
155    /// ```
156    /// # use jupiter::infograph::symbols::SymbolTable;
157    /// let mut table = SymbolTable::new();
158    ///
159    /// table.find_or_create("Hello").unwrap();
160    /// table.find_or_create("World").unwrap();
161    ///
162    /// println!("{}", table.allocated_size());
163    /// ```
164    pub fn allocated_size(&self) -> usize {
165        // This is the semi-official way of estimating the size of the underlying hash table..
166        //
167        // Internally a bit more than the actual capacity is allocated to guarantee a proper load
168        // factor...
169        let table_size = self.table.capacity() * 11 / 10
170            // Per entry, the key, its value and a hash is stored...
171            * (std::mem::size_of::<usize>() + std::mem::size_of::<String>() + std::mem::size_of::<Symbol>());
172
173        // The lookup-table is known to be simply a Vec of strings...
174        let lookup_size = self.symbols.capacity() * std::mem::size_of::<String>();
175
176        // And of course we have to add up the bytes in each string...
177        let content_size: usize = self.symbols.iter().map(|string| string.len()).sum();
178
179        table_size + lookup_size + content_size
180    }
181}
182
183/// Provides a efficient lookup map using `Symbol` as key.
184///
185/// This map is optimized  as we know for sure, that the key is an `i32`. Also, we will use this
186/// map for objects within `Node` and therefore know that most commonly there will be only a few
187/// keys (most probably < 8 and almost always < 32).
188///
189/// Therefore we simply use a `Vec` of tuples to store the keys and their value. We also keep
190/// this vector sorted by the key so that a lookup can be done via `binary_search`.
191///
192/// Using this approach easily beats `HashMap` in both, performance and memory consumption.
193#[derive(Debug, PartialEq)]
194pub struct SymbolMap<V> {
195    entries: Vec<(Symbol, V)>,
196}
197
198impl<V: Default> SymbolMap<V> {
199    /// Creates a new and empty map.
200    ///
201    /// # Examples
202    ///
203    /// ```
204    /// # use jupiter::infograph::symbols::SymbolMap;
205    /// let mut map = SymbolMap::<i32>::new();
206    /// assert_eq!(map.len(), 0);
207    /// ```
208    pub fn new() -> Self {
209        SymbolMap {
210            entries: Vec::with_capacity(4),
211        }
212    }
213
214    /// Retrieves the value for the given key.
215    ///
216    /// # Examples
217    ///
218    /// ```
219    /// # use jupiter::infograph::symbols::SymbolMap;
220    /// let mut map = SymbolMap::<i32>::new();
221    ///
222    /// map.put(42, 1);
223    ///
224    /// assert_eq!(map.get(42).unwrap(), &1);
225    /// assert_eq!(map.get(99), None);
226    /// ```
227    pub fn get(&self, key: Symbol) -> Option<&V> {
228        if let Ok(pos) = self.entries.binary_search_by(|(x, _)| x.cmp(&key)) {
229            Some(&self.entries[pos].1)
230        } else {
231            None
232        }
233    }
234
235    /// Retrieves the value for the given key as mutable reference.
236    ///
237    /// # Examples
238    ///
239    /// ```
240    /// # use jupiter::infograph::symbols::SymbolMap;
241    /// let mut map = SymbolMap::<String>::new();
242    ///
243    /// map.put(42, "Hello".to_owned());
244    /// map.get_mut(42).unwrap().push_str(" World");
245    ///
246    /// assert_eq!(map.get(42).unwrap().as_str(), "Hello World");
247    /// ```
248    pub fn get_mut(&mut self, key: Symbol) -> Option<&mut V> {
249        if let Ok(pos) = self.entries.binary_search_by(|(x, _)| x.cmp(&key)) {
250            Some(&mut self.entries[pos].1)
251        } else {
252            None
253        }
254    }
255
256    /// Puts a value in the for the given key.
257    ///
258    /// # Examples
259    ///
260    /// ```
261    /// # use jupiter::infograph::symbols::SymbolMap;
262    /// let mut map = SymbolMap::<String>::new();
263    ///
264    /// map.put(42, "Hello".to_owned());
265    ///
266    /// assert_eq!(map.get(42).unwrap().as_str(), "Hello");
267    /// ```
268    pub fn put(&mut self, key: Symbol, value: V) {
269        match self.entries.binary_search_by(|(x, _)| x.cmp(&key)) {
270            Ok(pos) => self.entries[pos].1 = value,
271            Err(pos) => self.entries.insert(pos, (key, value)),
272        }
273    }
274
275    /// Counts the number of entries in the map.
276    ///
277    /// # Examples
278    ///
279    /// ```
280    /// # use jupiter::infograph::symbols::SymbolMap;
281    /// let mut map = SymbolMap::<String>::new();
282    ///
283    /// // If an entry is put in the map, its size is incremented...
284    /// map.put(42, "Hello".to_owned());
285    /// assert_eq!(map.len(), 1);
286    ///    
287    /// // If the entry is overwritten, the size remains the same...
288    /// map.put(42, "Hello".to_owned());
289    /// assert_eq!(map.len(), 1);
290    ///
291    /// // If another entry is added, its size grows once again...
292    /// map.put(99, "World".to_owned());
293    /// assert_eq!(map.len(), 2);
294    /// ```
295    pub fn len(&self) -> usize {
296        self.entries.len()
297    }
298
299    /// Reports the capacity currently reserved.
300    ///
301    /// # Examples
302    ///
303    /// ```
304    /// # use jupiter::infograph::symbols::SymbolMap;
305    /// let mut map = SymbolMap::<String>::new();
306    ///
307    /// let initial_capacity = map.capacity();
308    ///
309    /// for i in 1..=initial_capacity+1 {
310    ///     map.put(i as i32, "Hello".to_owned());
311    /// }
312    ///
313    /// assert_eq!(map.capacity() > initial_capacity, true);
314    /// ```
315    pub fn capacity(&self) -> usize {
316        self.entries.capacity()
317    }
318
319    /// Provides an iterator over all entries in the map.
320    ///
321    /// # Examples
322    ///
323    /// ```
324    /// # use jupiter::infograph::symbols::SymbolMap;
325    /// # use itertools::Itertools;
326    /// let mut map = SymbolMap::<String>::new();
327    ///
328    /// map.put(42, "Hello".to_owned());
329    /// map.put(99, "World".to_owned());
330    ///
331    /// let string = map.entries().map(|(key, value)| format!("{}: {}", key, value)).join(", ");
332    /// assert_eq!(string, "42: Hello, 99: World");
333    /// ```
334    pub fn entries(&self) -> Iter<(Symbol, V)> {
335        self.entries.iter()
336    }
337}
338
339#[cfg(test)]
340mod tests {
341    use crate::infograph::symbols::{SymbolMap, SymbolTable};
342
343    #[test]
344    pub fn resolve_and_lookup_works() {
345        let mut table = SymbolTable::new();
346
347        let symbol = table.find_or_create("Hello").unwrap();
348        assert_eq!(table.find_or_create("Hello").unwrap(), symbol);
349        assert_eq!(table.resolve("Hello").unwrap(), symbol);
350        assert_eq!(table.lookup(symbol), "Hello");
351        assert_eq!(table.lookup(-1), "");
352        assert_eq!(table.resolve("World").is_none(), true);
353        assert_eq!(table.len(), 1);
354    }
355
356    #[test]
357    pub fn get_and_put_work_for_symbol_map() {
358        let mut map = SymbolMap::new();
359        for symbol in (1..128).rev() {
360            map.put(symbol, symbol);
361        }
362        for symbol in 1..128 {
363            assert_eq!(*map.get(symbol).unwrap(), symbol);
364            assert_eq!(*map.get_mut(symbol).unwrap(), symbol);
365        }
366
367        assert_eq!(map.get(130).is_none(), true);
368        assert_eq!(map.get_mut(130).is_none(), true);
369
370        map.put(1, 42);
371        assert_eq!(*map.get(1).unwrap(), 42);
372        assert_eq!(*map.get_mut(1).unwrap(), 42);
373        for symbol in 2..128 {
374            assert_eq!(*map.get(symbol).unwrap(), symbol);
375            assert_eq!(*map.get_mut(symbol).unwrap(), symbol);
376        }
377
378        assert_eq!(map.len(), 127);
379        assert_eq!(
380            map.entries().map(|(symbol, _)| symbol).sum::<i32>(),
381            127 * 128 / 2
382        );
383    }
384}