jupiter_rs/infograph/symbols.rs
1//! Provides a `SymbolTable` and a `SymbolMap` for an efficient and compact representation
2//! of objects / inner maps.
3//!
4//! As most datasets consist of many objects which all share the same keys, we keep the actual
5//! strings in a symbol table and only store the symbols in the actual objects.
6//!
7//! This heavily reduces the memory being consumed and also permits a very cache friendly and there-
8//! fore super fast lookup for values in inner objects.
9use std::cmp::Ordering;
10use std::collections::HashMap;
11use std::slice::Iter;
12
13/// Defines the representation of a symbol.
14///
15/// Note that a single `Doc` (and its `SymbolTable') may only store up to 2^31 - 1 distinct symbols.
16pub type Symbol = i32;
17
18/// Used to resolve and lookup symbols.
19pub struct SymbolTable {
20 table: HashMap<String, Symbol>,
21 symbols: Vec<String>,
22}
23
24impl SymbolTable {
25 /// Creates a new and empty symbol table.
26 ///
27 /// # Example
28 ///
29 /// ```
30 /// # use jupiter::infograph::symbols::SymbolTable;
31 /// let table = SymbolTable::new();
32 ///
33 /// assert_eq!(table.len(), 0);
34 /// ```
35 pub fn new() -> Self {
36 SymbolTable {
37 table: HashMap::new(),
38 symbols: Vec::new(),
39 }
40 }
41
42 /// Tries to resolve the given `string` into an existing `Symbol`.
43 ///
44 /// If no symbol with the given name is known, `None` is returned. If a new
45 /// symbol should be created instead, use `find_or_create`.
46 ///
47 /// # Example
48 /// ```
49 /// # use jupiter::infograph::symbols::SymbolTable;
50 /// let mut table = SymbolTable::new();
51 ///
52 /// let symbol = table.find_or_create("Test").unwrap();
53 ///
54 /// assert_eq!(table.resolve("Test").unwrap(), symbol);
55 /// assert_eq!(table.resolve("Unknown").is_none(), true);
56 /// ```
57 pub fn resolve(&self, string: impl AsRef<str>) -> Option<Symbol> {
58 let value = string.as_ref();
59 if let Some(symbol) = self.table.get(value) {
60 Some(*symbol)
61 } else {
62 None
63 }
64 }
65
66 /// Resolve the given `string` into a new or an existing `Symbol`.
67 ///
68 /// If no symbol should be created if the given name is unknown, use `resolve`.
69 ///
70 /// # Errors
71 ///
72 /// This will return an error if the internal symbol table overflows (if there are more than
73 /// std::i32::MAX - 2 symbols).
74 ///
75 /// # Example
76 /// ```
77 /// # use jupiter::infograph::symbols::SymbolTable;
78 /// let mut table = SymbolTable::new();
79 ///
80 /// let symbol = table.find_or_create("Test").unwrap();
81 /// assert_eq!(table.resolve("Test").unwrap(), symbol);
82 /// ```
83 pub fn find_or_create(&mut self, string: impl AsRef<str>) -> anyhow::Result<Symbol> {
84 let value = string.as_ref();
85 if let Some(symbol) = self.table.get(value) {
86 Ok(*symbol)
87 } else {
88 if self.symbols.len() >= std::i32::MAX as usize {
89 Err(anyhow::anyhow!("Symbol table overflow!"))
90 } else {
91 let new_symbol = (self.symbols.len() + 1) as i32;
92
93 self.table.insert(value.to_owned(), new_symbol);
94 self.symbols.push(value.to_owned());
95
96 Ok(new_symbol)
97 }
98 }
99 }
100
101 /// Retrieves the name of the given `Symbol`.
102 ///
103 /// # Examples
104 /// ```
105 /// # use jupiter::infograph::symbols::SymbolTable;
106 /// let mut table = SymbolTable::new();
107 ///
108 /// let symbol = table.find_or_create("Test").unwrap();
109 ///
110 /// // A known symbol can be looked up...
111 /// assert_eq!(table.lookup(symbol), "Test");
112 ///
113 /// // An unknown symbol is simply translated to ""
114 /// assert_eq!(table.lookup(1024), "");
115 /// ```
116 pub fn lookup(&self, symbol: Symbol) -> &str {
117 self.symbols
118 .get((symbol - 1) as usize)
119 .map(|string| string.as_str())
120 .unwrap_or("")
121 }
122
123 /// Determines the number of known symbols in the table.
124 ///
125 /// # Examples
126 /// ```
127 /// # use jupiter::infograph::symbols::SymbolTable;
128 /// let mut table = SymbolTable::new();
129 ///
130 /// // The same symbol is only added once to a table...
131 /// let symbol = table.find_or_create("Test").unwrap();
132 /// let symbol1 = table.find_or_create("Test").unwrap();
133 /// assert_eq!(symbol, symbol1);
134 ///
135 /// // ..therefore the table size is 1.
136 /// assert_eq!(table.len(), 1);
137 ///
138 /// // If we add another symbol...
139 /// table.find_or_create("Test 2").unwrap();
140 ///
141 /// // ...the size grows to 2.
142 /// assert_eq!(table.len(), 2);
143 /// ```
144 pub fn len(&self) -> usize {
145 self.symbols.len()
146 }
147
148 /// Estimates the allocated memory required to represent the symbol table.
149 ///
150 /// Note that this is only an approximation as some inner types to not reveal their
151 /// size.
152 ///
153 /// # Example
154 ///
155 /// ```
156 /// # use jupiter::infograph::symbols::SymbolTable;
157 /// let mut table = SymbolTable::new();
158 ///
159 /// table.find_or_create("Hello").unwrap();
160 /// table.find_or_create("World").unwrap();
161 ///
162 /// println!("{}", table.allocated_size());
163 /// ```
164 pub fn allocated_size(&self) -> usize {
165 // This is the semi-official way of estimating the size of the underlying hash table..
166 //
167 // Internally a bit more than the actual capacity is allocated to guarantee a proper load
168 // factor...
169 let table_size = self.table.capacity() * 11 / 10
170 // Per entry, the key, its value and a hash is stored...
171 * (std::mem::size_of::<usize>() + std::mem::size_of::<String>() + std::mem::size_of::<Symbol>());
172
173 // The lookup-table is known to be simply a Vec of strings...
174 let lookup_size = self.symbols.capacity() * std::mem::size_of::<String>();
175
176 // And of course we have to add up the bytes in each string...
177 let content_size: usize = self.symbols.iter().map(|string| string.len()).sum();
178
179 table_size + lookup_size + content_size
180 }
181}
182
183/// Provides a efficient lookup map using `Symbol` as key.
184///
185/// This map is optimized as we know for sure, that the key is an `i32`. Also, we will use this
186/// map for objects within `Node` and therefore know that most commonly there will be only a few
187/// keys (most probably < 8 and almost always < 32).
188///
189/// Therefore we simply use a `Vec` of tuples to store the keys and their value. We also keep
190/// this vector sorted by the key so that a lookup can be done via `binary_search`.
191///
192/// Using this approach easily beats `HashMap` in both, performance and memory consumption.
193#[derive(Debug, PartialEq)]
194pub struct SymbolMap<V> {
195 entries: Vec<(Symbol, V)>,
196}
197
198impl<V: Default> SymbolMap<V> {
199 /// Creates a new and empty map.
200 ///
201 /// # Examples
202 ///
203 /// ```
204 /// # use jupiter::infograph::symbols::SymbolMap;
205 /// let mut map = SymbolMap::<i32>::new();
206 /// assert_eq!(map.len(), 0);
207 /// ```
208 pub fn new() -> Self {
209 SymbolMap {
210 entries: Vec::with_capacity(4),
211 }
212 }
213
214 /// Retrieves the value for the given key.
215 ///
216 /// # Examples
217 ///
218 /// ```
219 /// # use jupiter::infograph::symbols::SymbolMap;
220 /// let mut map = SymbolMap::<i32>::new();
221 ///
222 /// map.put(42, 1);
223 ///
224 /// assert_eq!(map.get(42).unwrap(), &1);
225 /// assert_eq!(map.get(99), None);
226 /// ```
227 pub fn get(&self, key: Symbol) -> Option<&V> {
228 if let Ok(pos) = self.entries.binary_search_by(|(x, _)| x.cmp(&key)) {
229 Some(&self.entries[pos].1)
230 } else {
231 None
232 }
233 }
234
235 /// Retrieves the value for the given key as mutable reference.
236 ///
237 /// # Examples
238 ///
239 /// ```
240 /// # use jupiter::infograph::symbols::SymbolMap;
241 /// let mut map = SymbolMap::<String>::new();
242 ///
243 /// map.put(42, "Hello".to_owned());
244 /// map.get_mut(42).unwrap().push_str(" World");
245 ///
246 /// assert_eq!(map.get(42).unwrap().as_str(), "Hello World");
247 /// ```
248 pub fn get_mut(&mut self, key: Symbol) -> Option<&mut V> {
249 if let Ok(pos) = self.entries.binary_search_by(|(x, _)| x.cmp(&key)) {
250 Some(&mut self.entries[pos].1)
251 } else {
252 None
253 }
254 }
255
256 /// Puts a value in the for the given key.
257 ///
258 /// # Examples
259 ///
260 /// ```
261 /// # use jupiter::infograph::symbols::SymbolMap;
262 /// let mut map = SymbolMap::<String>::new();
263 ///
264 /// map.put(42, "Hello".to_owned());
265 ///
266 /// assert_eq!(map.get(42).unwrap().as_str(), "Hello");
267 /// ```
268 pub fn put(&mut self, key: Symbol, value: V) {
269 match self.entries.binary_search_by(|(x, _)| x.cmp(&key)) {
270 Ok(pos) => self.entries[pos].1 = value,
271 Err(pos) => self.entries.insert(pos, (key, value)),
272 }
273 }
274
275 /// Counts the number of entries in the map.
276 ///
277 /// # Examples
278 ///
279 /// ```
280 /// # use jupiter::infograph::symbols::SymbolMap;
281 /// let mut map = SymbolMap::<String>::new();
282 ///
283 /// // If an entry is put in the map, its size is incremented...
284 /// map.put(42, "Hello".to_owned());
285 /// assert_eq!(map.len(), 1);
286 ///
287 /// // If the entry is overwritten, the size remains the same...
288 /// map.put(42, "Hello".to_owned());
289 /// assert_eq!(map.len(), 1);
290 ///
291 /// // If another entry is added, its size grows once again...
292 /// map.put(99, "World".to_owned());
293 /// assert_eq!(map.len(), 2);
294 /// ```
295 pub fn len(&self) -> usize {
296 self.entries.len()
297 }
298
299 /// Reports the capacity currently reserved.
300 ///
301 /// # Examples
302 ///
303 /// ```
304 /// # use jupiter::infograph::symbols::SymbolMap;
305 /// let mut map = SymbolMap::<String>::new();
306 ///
307 /// let initial_capacity = map.capacity();
308 ///
309 /// for i in 1..=initial_capacity+1 {
310 /// map.put(i as i32, "Hello".to_owned());
311 /// }
312 ///
313 /// assert_eq!(map.capacity() > initial_capacity, true);
314 /// ```
315 pub fn capacity(&self) -> usize {
316 self.entries.capacity()
317 }
318
319 /// Provides an iterator over all entries in the map.
320 ///
321 /// # Examples
322 ///
323 /// ```
324 /// # use jupiter::infograph::symbols::SymbolMap;
325 /// # use itertools::Itertools;
326 /// let mut map = SymbolMap::<String>::new();
327 ///
328 /// map.put(42, "Hello".to_owned());
329 /// map.put(99, "World".to_owned());
330 ///
331 /// let string = map.entries().map(|(key, value)| format!("{}: {}", key, value)).join(", ");
332 /// assert_eq!(string, "42: Hello, 99: World");
333 /// ```
334 pub fn entries(&self) -> Iter<(Symbol, V)> {
335 self.entries.iter()
336 }
337}
338
339#[cfg(test)]
340mod tests {
341 use crate::infograph::symbols::{SymbolMap, SymbolTable};
342
343 #[test]
344 pub fn resolve_and_lookup_works() {
345 let mut table = SymbolTable::new();
346
347 let symbol = table.find_or_create("Hello").unwrap();
348 assert_eq!(table.find_or_create("Hello").unwrap(), symbol);
349 assert_eq!(table.resolve("Hello").unwrap(), symbol);
350 assert_eq!(table.lookup(symbol), "Hello");
351 assert_eq!(table.lookup(-1), "");
352 assert_eq!(table.resolve("World").is_none(), true);
353 assert_eq!(table.len(), 1);
354 }
355
356 #[test]
357 pub fn get_and_put_work_for_symbol_map() {
358 let mut map = SymbolMap::new();
359 for symbol in (1..128).rev() {
360 map.put(symbol, symbol);
361 }
362 for symbol in 1..128 {
363 assert_eq!(*map.get(symbol).unwrap(), symbol);
364 assert_eq!(*map.get_mut(symbol).unwrap(), symbol);
365 }
366
367 assert_eq!(map.get(130).is_none(), true);
368 assert_eq!(map.get_mut(130).is_none(), true);
369
370 map.put(1, 42);
371 assert_eq!(*map.get(1).unwrap(), 42);
372 assert_eq!(*map.get_mut(1).unwrap(), 42);
373 for symbol in 2..128 {
374 assert_eq!(*map.get(symbol).unwrap(), symbol);
375 assert_eq!(*map.get_mut(symbol).unwrap(), symbol);
376 }
377
378 assert_eq!(map.len(), 127);
379 assert_eq!(
380 map.entries().map(|(symbol, _)| symbol).sum::<i32>(),
381 127 * 128 / 2
382 );
383 }
384}