sochdb_core/
string_interner.rs1use parking_lot::RwLock;
53use std::collections::HashMap;
54use std::sync::atomic::{AtomicU32, Ordering};
55
56#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
62pub struct Symbol(u32);
63
64impl Symbol {
65 pub fn from_raw(value: u32) -> Self {
67 Symbol(value)
68 }
69
70 pub fn as_raw(&self) -> u32 {
72 self.0
73 }
74}
75
76impl From<u32> for Symbol {
77 fn from(value: u32) -> Self {
78 Symbol(value)
79 }
80}
81
82impl From<Symbol> for u32 {
83 fn from(symbol: Symbol) -> Self {
84 symbol.0
85 }
86}
87
88#[derive(Debug)]
93pub struct StringInterner {
94 string_to_symbol: HashMap<String, Symbol>,
96 symbol_to_string: Vec<String>,
98 next_symbol: u32,
100}
101
102impl StringInterner {
103 pub fn new() -> Self {
105 Self {
106 string_to_symbol: HashMap::new(),
107 symbol_to_string: Vec::new(),
108 next_symbol: 0,
109 }
110 }
111
112 pub fn with_capacity(capacity: usize) -> Self {
114 Self {
115 string_to_symbol: HashMap::with_capacity(capacity),
116 symbol_to_string: Vec::with_capacity(capacity),
117 next_symbol: 0,
118 }
119 }
120
121 pub fn get_or_intern(&mut self, s: &str) -> Symbol {
126 if let Some(&symbol) = self.string_to_symbol.get(s) {
127 return symbol;
128 }
129
130 let symbol = Symbol(self.next_symbol);
131 self.next_symbol += 1;
132
133 self.string_to_symbol.insert(s.to_string(), symbol);
134 self.symbol_to_string.push(s.to_string());
135
136 symbol
137 }
138
139 pub fn get_or_intern_owned(&mut self, s: String) -> Symbol {
141 if let Some(&symbol) = self.string_to_symbol.get(&s) {
142 return symbol;
143 }
144
145 let symbol = Symbol(self.next_symbol);
146 self.next_symbol += 1;
147
148 self.symbol_to_string.push(s.clone());
149 self.string_to_symbol.insert(s, symbol);
150
151 symbol
152 }
153
154 pub fn get(&self, s: &str) -> Option<Symbol> {
158 self.string_to_symbol.get(s).copied()
159 }
160
161 pub fn resolve(&self, symbol: Symbol) -> Option<&str> {
165 self.symbol_to_string
166 .get(symbol.0 as usize)
167 .map(|s| s.as_str())
168 }
169
170 pub fn len(&self) -> usize {
172 self.symbol_to_string.len()
173 }
174
175 pub fn is_empty(&self) -> bool {
177 self.symbol_to_string.is_empty()
178 }
179
180 pub fn memory_usage(&self) -> usize {
182 let map_overhead = self.string_to_symbol.len() * (3 * std::mem::size_of::<usize>());
184
185 let string_storage: usize = self
187 .symbol_to_string
188 .iter()
189 .map(|s| s.len() + std::mem::size_of::<String>())
190 .sum();
191
192 let vec_overhead = std::mem::size_of::<Vec<String>>()
194 + self.symbol_to_string.capacity() * std::mem::size_of::<String>();
195
196 map_overhead + string_storage + vec_overhead
197 }
198}
199
200impl Default for StringInterner {
201 fn default() -> Self {
202 Self::new()
203 }
204}
205
206const SHARD_COUNT: usize = 16;
208
209#[derive(Debug)]
214pub struct ConcurrentStringInterner {
215 shards: [RwLock<HashMap<String, Symbol>>; SHARD_COUNT],
217 symbols: RwLock<Vec<String>>,
219 next_symbol: AtomicU32,
221}
222
223impl ConcurrentStringInterner {
224 pub fn new() -> Self {
226 Self {
227 shards: std::array::from_fn(|_| RwLock::new(HashMap::new())),
228 symbols: RwLock::new(Vec::new()),
229 next_symbol: AtomicU32::new(0),
230 }
231 }
232
233 pub fn with_capacity(capacity: usize) -> Self {
235 let per_shard = capacity / SHARD_COUNT + 1;
236 Self {
237 shards: std::array::from_fn(|_| RwLock::new(HashMap::with_capacity(per_shard))),
238 symbols: RwLock::new(Vec::with_capacity(capacity)),
239 next_symbol: AtomicU32::new(0),
240 }
241 }
242
243 #[inline]
245 fn shard_for(&self, s: &str) -> usize {
246 let mut hash: u64 = 0xcbf29ce484222325;
248 for byte in s.bytes() {
249 hash ^= byte as u64;
250 hash = hash.wrapping_mul(0x100000001b3);
251 }
252 (hash as usize) % SHARD_COUNT
253 }
254
255 pub fn get_or_intern(&self, s: &str) -> Symbol {
257 let shard_idx = self.shard_for(s);
258
259 {
261 let shard = self.shards[shard_idx].read();
262 if let Some(&symbol) = shard.get(s) {
263 return symbol;
264 }
265 }
266
267 let mut shard = self.shards[shard_idx].write();
269
270 if let Some(&symbol) = shard.get(s) {
272 return symbol;
273 }
274
275 let symbol = Symbol(self.next_symbol.fetch_add(1, Ordering::SeqCst));
277
278 {
280 let mut symbols = self.symbols.write();
281 while symbols.len() <= symbol.0 as usize {
283 symbols.push(String::new());
284 }
285 symbols[symbol.0 as usize] = s.to_string();
286 }
287
288 shard.insert(s.to_string(), symbol);
290
291 symbol
292 }
293
294 pub fn get(&self, s: &str) -> Option<Symbol> {
296 let shard_idx = self.shard_for(s);
297 let shard = self.shards[shard_idx].read();
298 shard.get(s).copied()
299 }
300
301 pub fn resolve(&self, symbol: Symbol) -> Option<String> {
303 let symbols = self.symbols.read();
304 symbols.get(symbol.0 as usize).cloned()
305 }
306
307 pub fn len(&self) -> usize {
309 self.next_symbol.load(Ordering::Relaxed) as usize
310 }
311
312 pub fn is_empty(&self) -> bool {
314 self.len() == 0
315 }
316
317 pub fn memory_usage(&self) -> usize {
319 let symbols = self.symbols.read();
320
321 let string_storage: usize = symbols
323 .iter()
324 .map(|s| s.len() + std::mem::size_of::<String>())
325 .sum();
326
327 let shard_overhead =
329 SHARD_COUNT * 3 * std::mem::size_of::<usize>() * self.len() / SHARD_COUNT;
330
331 string_storage + shard_overhead
332 }
333}
334
335impl Default for ConcurrentStringInterner {
336 fn default() -> Self {
337 Self::new()
338 }
339}
340
341unsafe impl Send for ConcurrentStringInterner {}
343unsafe impl Sync for ConcurrentStringInterner {}
344
345pub mod global {
350 use super::*;
351 use std::sync::OnceLock;
352
353 static GLOBAL_INTERNER: OnceLock<ConcurrentStringInterner> = OnceLock::new();
354
355 pub fn interner() -> &'static ConcurrentStringInterner {
357 GLOBAL_INTERNER.get_or_init(ConcurrentStringInterner::new)
358 }
359
360 pub fn intern(s: &str) -> Symbol {
362 interner().get_or_intern(s)
363 }
364
365 pub fn resolve(symbol: Symbol) -> Option<String> {
367 interner().resolve(symbol)
368 }
369
370 pub fn memory_usage() -> usize {
372 interner().memory_usage()
373 }
374}
375
376#[cfg(test)]
377mod tests {
378 use super::*;
379 use std::thread;
380
381 #[test]
382 fn test_string_interner_basic() {
383 let mut interner = StringInterner::new();
384
385 let sym1 = interner.get_or_intern("hello");
386 let sym2 = interner.get_or_intern("world");
387 let sym3 = interner.get_or_intern("hello"); assert_eq!(sym1, sym3);
390 assert_ne!(sym1, sym2);
391
392 assert_eq!(interner.resolve(sym1), Some("hello"));
393 assert_eq!(interner.resolve(sym2), Some("world"));
394 }
395
396 #[test]
397 fn test_string_interner_get() {
398 let mut interner = StringInterner::new();
399
400 assert_eq!(interner.get("hello"), None);
401
402 let sym = interner.get_or_intern("hello");
403 assert_eq!(interner.get("hello"), Some(sym));
404 }
405
406 #[test]
407 fn test_string_interner_owned() {
408 let mut interner = StringInterner::new();
409
410 let sym1 = interner.get_or_intern_owned("hello".to_string());
411 let sym2 = interner.get_or_intern_owned("hello".to_string());
412
413 assert_eq!(sym1, sym2);
414 }
415
416 #[test]
417 fn test_string_interner_len() {
418 let mut interner = StringInterner::new();
419
420 assert_eq!(interner.len(), 0);
421 assert!(interner.is_empty());
422
423 interner.get_or_intern("hello");
424 assert_eq!(interner.len(), 1);
425
426 interner.get_or_intern("world");
427 assert_eq!(interner.len(), 2);
428
429 interner.get_or_intern("hello"); assert_eq!(interner.len(), 2);
431 }
432
433 #[test]
434 fn test_concurrent_interner_basic() {
435 let interner = ConcurrentStringInterner::new();
436
437 let sym1 = interner.get_or_intern("hello");
438 let sym2 = interner.get_or_intern("world");
439 let sym3 = interner.get_or_intern("hello");
440
441 assert_eq!(sym1, sym3);
442 assert_ne!(sym1, sym2);
443
444 assert_eq!(interner.resolve(sym1), Some("hello".to_string()));
445 assert_eq!(interner.resolve(sym2), Some("world".to_string()));
446 }
447
448 #[test]
449 fn test_concurrent_interner_threaded() {
450 use std::sync::Arc;
451
452 let interner = Arc::new(ConcurrentStringInterner::new());
453 let mut handles = vec![];
454
455 for _i in 0..8 {
457 let interner = Arc::clone(&interner);
458 let handle = thread::spawn(move || {
459 for j in 0..100 {
460 let s = format!("string_{}", j % 50);
461 interner.get_or_intern(&s);
462 }
463 });
464 handles.push(handle);
465 }
466
467 for handle in handles {
468 handle.join().unwrap();
469 }
470
471 assert_eq!(interner.len(), 50);
473 }
474
475 #[test]
476 fn test_path_segment_interning() {
477 let mut interner = StringInterner::new();
478
479 let paths = [
481 "users.profile.settings.theme",
482 "users.profile.settings.language",
483 "users.profile.name",
484 "products.inventory.count",
485 "products.inventory.location",
486 ];
487
488 for path in &paths {
489 for segment in path.split('.') {
490 interner.get_or_intern(segment);
491 }
492 }
493
494 assert_eq!(interner.len(), 10);
497 }
498
499 #[test]
500 fn test_symbol_serialization() {
501 let mut interner = StringInterner::new();
502
503 let sym = interner.get_or_intern("test");
504 let raw = sym.as_raw();
505 let sym2 = Symbol::from_raw(raw);
506
507 assert_eq!(sym, sym2);
508 }
509
510 #[test]
511 fn test_global_interner() {
512 let sym1 = global::intern("global_test");
513 let sym2 = global::intern("global_test");
514
515 assert_eq!(sym1, sym2);
516 assert_eq!(global::resolve(sym1), Some("global_test".to_string()));
517 }
518
519 #[test]
520 fn test_memory_usage() {
521 let mut interner = StringInterner::new();
522
523 for i in 0..1000 {
524 interner.get_or_intern(&format!("string_{}", i));
525 }
526
527 let usage = interner.memory_usage();
528 assert!(usage < 100_000, "Memory usage too high: {}", usage);
530 }
531
532 #[test]
533 fn test_empty_string() {
534 let mut interner = StringInterner::new();
535
536 let sym = interner.get_or_intern("");
537 assert_eq!(interner.resolve(sym), Some(""));
538 }
539
540 #[test]
541 fn test_unicode_strings() {
542 let mut interner = StringInterner::new();
543
544 let sym1 = interner.get_or_intern("こんにちは");
545 let sym2 = interner.get_or_intern("世界");
546 let sym3 = interner.get_or_intern("こんにちは");
547
548 assert_eq!(sym1, sym3);
549 assert_ne!(sym1, sym2);
550
551 assert_eq!(interner.resolve(sym1), Some("こんにちは"));
552 assert_eq!(interner.resolve(sym2), Some("世界"));
553 }
554}