sochdb_core/
string_interner.rs1use parking_lot::RwLock;
50use std::collections::HashMap;
51use std::sync::atomic::{AtomicU32, Ordering};
52
53#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
59pub struct Symbol(u32);
60
61impl Symbol {
62 pub fn from_raw(value: u32) -> Self {
64 Symbol(value)
65 }
66
67 pub fn as_raw(&self) -> u32 {
69 self.0
70 }
71}
72
73impl From<u32> for Symbol {
74 fn from(value: u32) -> Self {
75 Symbol(value)
76 }
77}
78
79impl From<Symbol> for u32 {
80 fn from(symbol: Symbol) -> Self {
81 symbol.0
82 }
83}
84
85#[derive(Debug)]
90pub struct StringInterner {
91 string_to_symbol: HashMap<String, Symbol>,
93 symbol_to_string: Vec<String>,
95 next_symbol: u32,
97}
98
99impl StringInterner {
100 pub fn new() -> Self {
102 Self {
103 string_to_symbol: HashMap::new(),
104 symbol_to_string: Vec::new(),
105 next_symbol: 0,
106 }
107 }
108
109 pub fn with_capacity(capacity: usize) -> Self {
111 Self {
112 string_to_symbol: HashMap::with_capacity(capacity),
113 symbol_to_string: Vec::with_capacity(capacity),
114 next_symbol: 0,
115 }
116 }
117
118 pub fn get_or_intern(&mut self, s: &str) -> Symbol {
123 if let Some(&symbol) = self.string_to_symbol.get(s) {
124 return symbol;
125 }
126
127 let symbol = Symbol(self.next_symbol);
128 self.next_symbol += 1;
129
130 self.string_to_symbol.insert(s.to_string(), symbol);
131 self.symbol_to_string.push(s.to_string());
132
133 symbol
134 }
135
136 pub fn get_or_intern_owned(&mut self, s: String) -> Symbol {
138 if let Some(&symbol) = self.string_to_symbol.get(&s) {
139 return symbol;
140 }
141
142 let symbol = Symbol(self.next_symbol);
143 self.next_symbol += 1;
144
145 self.symbol_to_string.push(s.clone());
146 self.string_to_symbol.insert(s, symbol);
147
148 symbol
149 }
150
151 pub fn get(&self, s: &str) -> Option<Symbol> {
155 self.string_to_symbol.get(s).copied()
156 }
157
158 pub fn resolve(&self, symbol: Symbol) -> Option<&str> {
162 self.symbol_to_string
163 .get(symbol.0 as usize)
164 .map(|s| s.as_str())
165 }
166
167 pub fn len(&self) -> usize {
169 self.symbol_to_string.len()
170 }
171
172 pub fn is_empty(&self) -> bool {
174 self.symbol_to_string.is_empty()
175 }
176
177 pub fn memory_usage(&self) -> usize {
179 let map_overhead = self.string_to_symbol.len() * (3 * std::mem::size_of::<usize>());
181
182 let string_storage: usize = self
184 .symbol_to_string
185 .iter()
186 .map(|s| s.len() + std::mem::size_of::<String>())
187 .sum();
188
189 let vec_overhead = std::mem::size_of::<Vec<String>>()
191 + self.symbol_to_string.capacity() * std::mem::size_of::<String>();
192
193 map_overhead + string_storage + vec_overhead
194 }
195}
196
197impl Default for StringInterner {
198 fn default() -> Self {
199 Self::new()
200 }
201}
202
203const SHARD_COUNT: usize = 16;
205
206#[derive(Debug)]
211pub struct ConcurrentStringInterner {
212 shards: [RwLock<HashMap<String, Symbol>>; SHARD_COUNT],
214 symbols: RwLock<Vec<String>>,
216 next_symbol: AtomicU32,
218}
219
220impl ConcurrentStringInterner {
221 pub fn new() -> Self {
223 Self {
224 shards: std::array::from_fn(|_| RwLock::new(HashMap::new())),
225 symbols: RwLock::new(Vec::new()),
226 next_symbol: AtomicU32::new(0),
227 }
228 }
229
230 pub fn with_capacity(capacity: usize) -> Self {
232 let per_shard = capacity / SHARD_COUNT + 1;
233 Self {
234 shards: std::array::from_fn(|_| RwLock::new(HashMap::with_capacity(per_shard))),
235 symbols: RwLock::new(Vec::with_capacity(capacity)),
236 next_symbol: AtomicU32::new(0),
237 }
238 }
239
240 #[inline]
242 fn shard_for(&self, s: &str) -> usize {
243 let mut hash: u64 = 0xcbf29ce484222325;
245 for byte in s.bytes() {
246 hash ^= byte as u64;
247 hash = hash.wrapping_mul(0x100000001b3);
248 }
249 (hash as usize) % SHARD_COUNT
250 }
251
252 pub fn get_or_intern(&self, s: &str) -> Symbol {
254 let shard_idx = self.shard_for(s);
255
256 {
258 let shard = self.shards[shard_idx].read();
259 if let Some(&symbol) = shard.get(s) {
260 return symbol;
261 }
262 }
263
264 let mut shard = self.shards[shard_idx].write();
266
267 if let Some(&symbol) = shard.get(s) {
269 return symbol;
270 }
271
272 let symbol = Symbol(self.next_symbol.fetch_add(1, Ordering::SeqCst));
274
275 {
277 let mut symbols = self.symbols.write();
278 while symbols.len() <= symbol.0 as usize {
280 symbols.push(String::new());
281 }
282 symbols[symbol.0 as usize] = s.to_string();
283 }
284
285 shard.insert(s.to_string(), symbol);
287
288 symbol
289 }
290
291 pub fn get(&self, s: &str) -> Option<Symbol> {
293 let shard_idx = self.shard_for(s);
294 let shard = self.shards[shard_idx].read();
295 shard.get(s).copied()
296 }
297
298 pub fn resolve(&self, symbol: Symbol) -> Option<String> {
300 let symbols = self.symbols.read();
301 symbols.get(symbol.0 as usize).cloned()
302 }
303
304 pub fn len(&self) -> usize {
306 self.next_symbol.load(Ordering::Relaxed) as usize
307 }
308
309 pub fn is_empty(&self) -> bool {
311 self.len() == 0
312 }
313
314 pub fn memory_usage(&self) -> usize {
316 let symbols = self.symbols.read();
317
318 let string_storage: usize = symbols
320 .iter()
321 .map(|s| s.len() + std::mem::size_of::<String>())
322 .sum();
323
324 let shard_overhead =
326 SHARD_COUNT * 3 * std::mem::size_of::<usize>() * self.len() / SHARD_COUNT;
327
328 string_storage + shard_overhead
329 }
330}
331
332impl Default for ConcurrentStringInterner {
333 fn default() -> Self {
334 Self::new()
335 }
336}
337
338unsafe impl Send for ConcurrentStringInterner {}
340unsafe impl Sync for ConcurrentStringInterner {}
341
342pub mod global {
347 use super::*;
348 use std::sync::OnceLock;
349
350 static GLOBAL_INTERNER: OnceLock<ConcurrentStringInterner> = OnceLock::new();
351
352 pub fn interner() -> &'static ConcurrentStringInterner {
354 GLOBAL_INTERNER.get_or_init(ConcurrentStringInterner::new)
355 }
356
357 pub fn intern(s: &str) -> Symbol {
359 interner().get_or_intern(s)
360 }
361
362 pub fn resolve(symbol: Symbol) -> Option<String> {
364 interner().resolve(symbol)
365 }
366
367 pub fn memory_usage() -> usize {
369 interner().memory_usage()
370 }
371}
372
373#[cfg(test)]
374mod tests {
375 use super::*;
376 use std::thread;
377
378 #[test]
379 fn test_string_interner_basic() {
380 let mut interner = StringInterner::new();
381
382 let sym1 = interner.get_or_intern("hello");
383 let sym2 = interner.get_or_intern("world");
384 let sym3 = interner.get_or_intern("hello"); assert_eq!(sym1, sym3);
387 assert_ne!(sym1, sym2);
388
389 assert_eq!(interner.resolve(sym1), Some("hello"));
390 assert_eq!(interner.resolve(sym2), Some("world"));
391 }
392
393 #[test]
394 fn test_string_interner_get() {
395 let mut interner = StringInterner::new();
396
397 assert_eq!(interner.get("hello"), None);
398
399 let sym = interner.get_or_intern("hello");
400 assert_eq!(interner.get("hello"), Some(sym));
401 }
402
403 #[test]
404 fn test_string_interner_owned() {
405 let mut interner = StringInterner::new();
406
407 let sym1 = interner.get_or_intern_owned("hello".to_string());
408 let sym2 = interner.get_or_intern_owned("hello".to_string());
409
410 assert_eq!(sym1, sym2);
411 }
412
413 #[test]
414 fn test_string_interner_len() {
415 let mut interner = StringInterner::new();
416
417 assert_eq!(interner.len(), 0);
418 assert!(interner.is_empty());
419
420 interner.get_or_intern("hello");
421 assert_eq!(interner.len(), 1);
422
423 interner.get_or_intern("world");
424 assert_eq!(interner.len(), 2);
425
426 interner.get_or_intern("hello"); assert_eq!(interner.len(), 2);
428 }
429
430 #[test]
431 fn test_concurrent_interner_basic() {
432 let interner = ConcurrentStringInterner::new();
433
434 let sym1 = interner.get_or_intern("hello");
435 let sym2 = interner.get_or_intern("world");
436 let sym3 = interner.get_or_intern("hello");
437
438 assert_eq!(sym1, sym3);
439 assert_ne!(sym1, sym2);
440
441 assert_eq!(interner.resolve(sym1), Some("hello".to_string()));
442 assert_eq!(interner.resolve(sym2), Some("world".to_string()));
443 }
444
445 #[test]
446 fn test_concurrent_interner_threaded() {
447 use std::sync::Arc;
448
449 let interner = Arc::new(ConcurrentStringInterner::new());
450 let mut handles = vec![];
451
452 for _i in 0..8 {
454 let interner = Arc::clone(&interner);
455 let handle = thread::spawn(move || {
456 for j in 0..100 {
457 let s = format!("string_{}", j % 50);
458 interner.get_or_intern(&s);
459 }
460 });
461 handles.push(handle);
462 }
463
464 for handle in handles {
465 handle.join().unwrap();
466 }
467
468 assert_eq!(interner.len(), 50);
470 }
471
472 #[test]
473 fn test_path_segment_interning() {
474 let mut interner = StringInterner::new();
475
476 let paths = [
478 "users.profile.settings.theme",
479 "users.profile.settings.language",
480 "users.profile.name",
481 "products.inventory.count",
482 "products.inventory.location",
483 ];
484
485 for path in &paths {
486 for segment in path.split('.') {
487 interner.get_or_intern(segment);
488 }
489 }
490
491 assert_eq!(interner.len(), 10);
494 }
495
496 #[test]
497 fn test_symbol_serialization() {
498 let mut interner = StringInterner::new();
499
500 let sym = interner.get_or_intern("test");
501 let raw = sym.as_raw();
502 let sym2 = Symbol::from_raw(raw);
503
504 assert_eq!(sym, sym2);
505 }
506
507 #[test]
508 fn test_global_interner() {
509 let sym1 = global::intern("global_test");
510 let sym2 = global::intern("global_test");
511
512 assert_eq!(sym1, sym2);
513 assert_eq!(global::resolve(sym1), Some("global_test".to_string()));
514 }
515
516 #[test]
517 fn test_memory_usage() {
518 let mut interner = StringInterner::new();
519
520 for i in 0..1000 {
521 interner.get_or_intern(&format!("string_{}", i));
522 }
523
524 let usage = interner.memory_usage();
525 assert!(usage < 100_000, "Memory usage too high: {}", usage);
527 }
528
529 #[test]
530 fn test_empty_string() {
531 let mut interner = StringInterner::new();
532
533 let sym = interner.get_or_intern("");
534 assert_eq!(interner.resolve(sym), Some(""));
535 }
536
537 #[test]
538 fn test_unicode_strings() {
539 let mut interner = StringInterner::new();
540
541 let sym1 = interner.get_or_intern("こんにちは");
542 let sym2 = interner.get_or_intern("世界");
543 let sym3 = interner.get_or_intern("こんにちは");
544
545 assert_eq!(sym1, sym3);
546 assert_ne!(sym1, sym2);
547
548 assert_eq!(interner.resolve(sym1), Some("こんにちは"));
549 assert_eq!(interner.resolve(sym2), Some("世界"));
550 }
551}