sochdb_core/
string_interner.rs

1// Copyright 2025 Sushanth (https://github.com/sushanthpy)
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! String Interning for Path Segments
16//!
17//! This module implements string interning to reduce memory usage when
18//! storing repeated path segments in the PathTrie and other data structures.
19//!
20//! ## Memory Savings
21//!
22//! For paths like "users.profile.settings.theme", the segment "users" may
23//! appear thousands of times across different paths. String interning
24//! ensures each unique segment is stored only once.
25//!
26//! ## Example
27//!
28//! ```rust,ignore
29//! use sochdb_core::string_interner::StringInterner;
30//!
31//! let mut interner = StringInterner::new();
32//!
33//! // Intern a string - returns a small integer symbol
34//! let sym1 = interner.get_or_intern("users");
35//! let sym2 = interner.get_or_intern("users"); // Same symbol returned
36//!
37//! assert_eq!(sym1, sym2); // Same string = same symbol
38//!
39//! // Resolve back to string
40//! let s = interner.resolve(sym1);
41//! assert_eq!(s, Some("users"));
42//! ```
43//!
44//! ## Thread Safety
45//!
46//! The `ConcurrentStringInterner` variant is thread-safe and uses a
47//! sharded lock design for concurrent access.
48
49use parking_lot::RwLock;
50use std::collections::HashMap;
51use std::sync::atomic::{AtomicU32, Ordering};
52
53/// A symbol representing an interned string
54///
55/// This is a small integer that can be used to look up the original string.
56/// Using symbols instead of strings reduces memory usage when the same
57/// string appears multiple times.
58#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
59pub struct Symbol(u32);
60
61impl Symbol {
62    /// Create a symbol from a raw u32 (for deserialization)
63    pub fn from_raw(value: u32) -> Self {
64        Symbol(value)
65    }
66
67    /// Get the raw u32 value (for serialization)
68    pub fn as_raw(&self) -> u32 {
69        self.0
70    }
71}
72
73impl From<u32> for Symbol {
74    fn from(value: u32) -> Self {
75        Symbol(value)
76    }
77}
78
79impl From<Symbol> for u32 {
80    fn from(symbol: Symbol) -> Self {
81        symbol.0
82    }
83}
84
85/// A string interner that stores unique strings and returns symbols
86///
87/// This is the single-threaded version. For concurrent access, use
88/// `ConcurrentStringInterner`.
89#[derive(Debug)]
90pub struct StringInterner {
91    /// Map from string to symbol
92    string_to_symbol: HashMap<String, Symbol>,
93    /// Map from symbol to string (for resolve)
94    symbol_to_string: Vec<String>,
95    /// Next symbol to assign
96    next_symbol: u32,
97}
98
99impl StringInterner {
100    /// Create a new empty string interner
101    pub fn new() -> Self {
102        Self {
103            string_to_symbol: HashMap::new(),
104            symbol_to_string: Vec::new(),
105            next_symbol: 0,
106        }
107    }
108
109    /// Create a new string interner with pre-allocated capacity
110    pub fn with_capacity(capacity: usize) -> Self {
111        Self {
112            string_to_symbol: HashMap::with_capacity(capacity),
113            symbol_to_string: Vec::with_capacity(capacity),
114            next_symbol: 0,
115        }
116    }
117
118    /// Intern a string and return its symbol
119    ///
120    /// If the string is already interned, returns the existing symbol.
121    /// Otherwise, creates a new symbol for this string.
122    pub fn get_or_intern(&mut self, s: &str) -> Symbol {
123        if let Some(&symbol) = self.string_to_symbol.get(s) {
124            return symbol;
125        }
126
127        let symbol = Symbol(self.next_symbol);
128        self.next_symbol += 1;
129
130        self.string_to_symbol.insert(s.to_string(), symbol);
131        self.symbol_to_string.push(s.to_string());
132
133        symbol
134    }
135
136    /// Intern an owned string (avoids clone if string is new)
137    pub fn get_or_intern_owned(&mut self, s: String) -> Symbol {
138        if let Some(&symbol) = self.string_to_symbol.get(&s) {
139            return symbol;
140        }
141
142        let symbol = Symbol(self.next_symbol);
143        self.next_symbol += 1;
144
145        self.symbol_to_string.push(s.clone());
146        self.string_to_symbol.insert(s, symbol);
147
148        symbol
149    }
150
151    /// Look up an already-interned string
152    ///
153    /// Returns None if the string has not been interned
154    pub fn get(&self, s: &str) -> Option<Symbol> {
155        self.string_to_symbol.get(s).copied()
156    }
157
158    /// Resolve a symbol to its string
159    ///
160    /// Returns None if the symbol is invalid
161    pub fn resolve(&self, symbol: Symbol) -> Option<&str> {
162        self.symbol_to_string
163            .get(symbol.0 as usize)
164            .map(|s| s.as_str())
165    }
166
167    /// Get the number of interned strings
168    pub fn len(&self) -> usize {
169        self.symbol_to_string.len()
170    }
171
172    /// Check if the interner is empty
173    pub fn is_empty(&self) -> bool {
174        self.symbol_to_string.is_empty()
175    }
176
177    /// Get memory usage estimate in bytes
178    pub fn memory_usage(&self) -> usize {
179        // HashMap overhead (rough estimate: 3 words per entry)
180        let map_overhead = self.string_to_symbol.len() * (3 * std::mem::size_of::<usize>());
181
182        // String storage
183        let string_storage: usize = self
184            .symbol_to_string
185            .iter()
186            .map(|s| s.len() + std::mem::size_of::<String>())
187            .sum();
188
189        // Vec overhead
190        let vec_overhead = std::mem::size_of::<Vec<String>>()
191            + self.symbol_to_string.capacity() * std::mem::size_of::<String>();
192
193        map_overhead + string_storage + vec_overhead
194    }
195}
196
197impl Default for StringInterner {
198    fn default() -> Self {
199        Self::new()
200    }
201}
202
203/// Number of shards for the concurrent interner
204const SHARD_COUNT: usize = 16;
205
206/// Thread-safe string interner using sharded locks
207///
208/// Uses a simple hash-based sharding to reduce lock contention
209/// when multiple threads are interning strings concurrently.
210#[derive(Debug)]
211pub struct ConcurrentStringInterner {
212    /// Sharded map from string to symbol
213    shards: [RwLock<HashMap<String, Symbol>>; SHARD_COUNT],
214    /// Global symbol table (append-only)
215    symbols: RwLock<Vec<String>>,
216    /// Next symbol to assign (atomic for lock-free reads)
217    next_symbol: AtomicU32,
218}
219
220impl ConcurrentStringInterner {
221    /// Create a new empty concurrent string interner
222    pub fn new() -> Self {
223        Self {
224            shards: std::array::from_fn(|_| RwLock::new(HashMap::new())),
225            symbols: RwLock::new(Vec::new()),
226            next_symbol: AtomicU32::new(0),
227        }
228    }
229
230    /// Create a new concurrent string interner with pre-allocated capacity
231    pub fn with_capacity(capacity: usize) -> Self {
232        let per_shard = capacity / SHARD_COUNT + 1;
233        Self {
234            shards: std::array::from_fn(|_| RwLock::new(HashMap::with_capacity(per_shard))),
235            symbols: RwLock::new(Vec::with_capacity(capacity)),
236            next_symbol: AtomicU32::new(0),
237        }
238    }
239
240    /// Get the shard index for a string
241    #[inline]
242    fn shard_for(&self, s: &str) -> usize {
243        // Simple FNV-1a hash for shard selection
244        let mut hash: u64 = 0xcbf29ce484222325;
245        for byte in s.bytes() {
246            hash ^= byte as u64;
247            hash = hash.wrapping_mul(0x100000001b3);
248        }
249        (hash as usize) % SHARD_COUNT
250    }
251
252    /// Intern a string and return its symbol
253    pub fn get_or_intern(&self, s: &str) -> Symbol {
254        let shard_idx = self.shard_for(s);
255
256        // Try read-only lookup first (fast path)
257        {
258            let shard = self.shards[shard_idx].read();
259            if let Some(&symbol) = shard.get(s) {
260                return symbol;
261            }
262        }
263
264        // Need to insert - acquire write lock
265        let mut shard = self.shards[shard_idx].write();
266
267        // Double-check after acquiring write lock (another thread may have inserted)
268        if let Some(&symbol) = shard.get(s) {
269            return symbol;
270        }
271
272        // Allocate new symbol
273        let symbol = Symbol(self.next_symbol.fetch_add(1, Ordering::SeqCst));
274
275        // Insert into symbol table (under separate lock)
276        {
277            let mut symbols = self.symbols.write();
278            // Ensure symbols vector has space
279            while symbols.len() <= symbol.0 as usize {
280                symbols.push(String::new());
281            }
282            symbols[symbol.0 as usize] = s.to_string();
283        }
284
285        // Insert into shard
286        shard.insert(s.to_string(), symbol);
287
288        symbol
289    }
290
291    /// Look up an already-interned string
292    pub fn get(&self, s: &str) -> Option<Symbol> {
293        let shard_idx = self.shard_for(s);
294        let shard = self.shards[shard_idx].read();
295        shard.get(s).copied()
296    }
297
298    /// Resolve a symbol to its string
299    pub fn resolve(&self, symbol: Symbol) -> Option<String> {
300        let symbols = self.symbols.read();
301        symbols.get(symbol.0 as usize).cloned()
302    }
303
304    /// Get the number of interned strings
305    pub fn len(&self) -> usize {
306        self.next_symbol.load(Ordering::Relaxed) as usize
307    }
308
309    /// Check if the interner is empty
310    pub fn is_empty(&self) -> bool {
311        self.len() == 0
312    }
313
314    /// Get memory usage estimate in bytes
315    pub fn memory_usage(&self) -> usize {
316        let symbols = self.symbols.read();
317
318        // String storage
319        let string_storage: usize = symbols
320            .iter()
321            .map(|s| s.len() + std::mem::size_of::<String>())
322            .sum();
323
324        // Shard overhead (rough estimate)
325        let shard_overhead =
326            SHARD_COUNT * 3 * std::mem::size_of::<usize>() * self.len() / SHARD_COUNT;
327
328        string_storage + shard_overhead
329    }
330}
331
332impl Default for ConcurrentStringInterner {
333    fn default() -> Self {
334        Self::new()
335    }
336}
337
338// Make ConcurrentStringInterner Send + Sync
339unsafe impl Send for ConcurrentStringInterner {}
340unsafe impl Sync for ConcurrentStringInterner {}
341
342/// Global string interner for path segments
343///
344/// This is a convenience wrapper around ConcurrentStringInterner
345/// that provides a global instance for interning path segments.
346pub mod global {
347    use super::*;
348    use std::sync::OnceLock;
349
350    static GLOBAL_INTERNER: OnceLock<ConcurrentStringInterner> = OnceLock::new();
351
352    /// Get the global string interner
353    pub fn interner() -> &'static ConcurrentStringInterner {
354        GLOBAL_INTERNER.get_or_init(ConcurrentStringInterner::new)
355    }
356
357    /// Intern a string in the global interner
358    pub fn intern(s: &str) -> Symbol {
359        interner().get_or_intern(s)
360    }
361
362    /// Resolve a symbol from the global interner
363    pub fn resolve(symbol: Symbol) -> Option<String> {
364        interner().resolve(symbol)
365    }
366
367    /// Get memory usage of the global interner
368    pub fn memory_usage() -> usize {
369        interner().memory_usage()
370    }
371}
372
373#[cfg(test)]
374mod tests {
375    use super::*;
376    use std::thread;
377
378    #[test]
379    fn test_string_interner_basic() {
380        let mut interner = StringInterner::new();
381
382        let sym1 = interner.get_or_intern("hello");
383        let sym2 = interner.get_or_intern("world");
384        let sym3 = interner.get_or_intern("hello"); // Same as sym1
385
386        assert_eq!(sym1, sym3);
387        assert_ne!(sym1, sym2);
388
389        assert_eq!(interner.resolve(sym1), Some("hello"));
390        assert_eq!(interner.resolve(sym2), Some("world"));
391    }
392
393    #[test]
394    fn test_string_interner_get() {
395        let mut interner = StringInterner::new();
396
397        assert_eq!(interner.get("hello"), None);
398
399        let sym = interner.get_or_intern("hello");
400        assert_eq!(interner.get("hello"), Some(sym));
401    }
402
403    #[test]
404    fn test_string_interner_owned() {
405        let mut interner = StringInterner::new();
406
407        let sym1 = interner.get_or_intern_owned("hello".to_string());
408        let sym2 = interner.get_or_intern_owned("hello".to_string());
409
410        assert_eq!(sym1, sym2);
411    }
412
413    #[test]
414    fn test_string_interner_len() {
415        let mut interner = StringInterner::new();
416
417        assert_eq!(interner.len(), 0);
418        assert!(interner.is_empty());
419
420        interner.get_or_intern("hello");
421        assert_eq!(interner.len(), 1);
422
423        interner.get_or_intern("world");
424        assert_eq!(interner.len(), 2);
425
426        interner.get_or_intern("hello"); // Duplicate
427        assert_eq!(interner.len(), 2);
428    }
429
430    #[test]
431    fn test_concurrent_interner_basic() {
432        let interner = ConcurrentStringInterner::new();
433
434        let sym1 = interner.get_or_intern("hello");
435        let sym2 = interner.get_or_intern("world");
436        let sym3 = interner.get_or_intern("hello");
437
438        assert_eq!(sym1, sym3);
439        assert_ne!(sym1, sym2);
440
441        assert_eq!(interner.resolve(sym1), Some("hello".to_string()));
442        assert_eq!(interner.resolve(sym2), Some("world".to_string()));
443    }
444
445    #[test]
446    fn test_concurrent_interner_threaded() {
447        use std::sync::Arc;
448
449        let interner = Arc::new(ConcurrentStringInterner::new());
450        let mut handles = vec![];
451
452        // Spawn multiple threads interning overlapping strings
453        for _i in 0..8 {
454            let interner = Arc::clone(&interner);
455            let handle = thread::spawn(move || {
456                for j in 0..100 {
457                    let s = format!("string_{}", j % 50);
458                    interner.get_or_intern(&s);
459                }
460            });
461            handles.push(handle);
462        }
463
464        for handle in handles {
465            handle.join().unwrap();
466        }
467
468        // Should have 50 unique strings
469        assert_eq!(interner.len(), 50);
470    }
471
472    #[test]
473    fn test_path_segment_interning() {
474        let mut interner = StringInterner::new();
475
476        // Simulate interning path segments
477        let paths = [
478            "users.profile.settings.theme",
479            "users.profile.settings.language",
480            "users.profile.name",
481            "products.inventory.count",
482            "products.inventory.location",
483        ];
484
485        for path in &paths {
486            for segment in path.split('.') {
487                interner.get_or_intern(segment);
488            }
489        }
490
491        // Count unique segments
492        // users, profile, settings, theme, language, name, products, inventory, count, location = 10
493        assert_eq!(interner.len(), 10);
494    }
495
496    #[test]
497    fn test_symbol_serialization() {
498        let mut interner = StringInterner::new();
499
500        let sym = interner.get_or_intern("test");
501        let raw = sym.as_raw();
502        let sym2 = Symbol::from_raw(raw);
503
504        assert_eq!(sym, sym2);
505    }
506
507    #[test]
508    fn test_global_interner() {
509        let sym1 = global::intern("global_test");
510        let sym2 = global::intern("global_test");
511
512        assert_eq!(sym1, sym2);
513        assert_eq!(global::resolve(sym1), Some("global_test".to_string()));
514    }
515
516    #[test]
517    fn test_memory_usage() {
518        let mut interner = StringInterner::new();
519
520        for i in 0..1000 {
521            interner.get_or_intern(&format!("string_{}", i));
522        }
523
524        let usage = interner.memory_usage();
525        // Should be reasonable - less than 100KB for 1000 short strings
526        assert!(usage < 100_000, "Memory usage too high: {}", usage);
527    }
528
529    #[test]
530    fn test_empty_string() {
531        let mut interner = StringInterner::new();
532
533        let sym = interner.get_or_intern("");
534        assert_eq!(interner.resolve(sym), Some(""));
535    }
536
537    #[test]
538    fn test_unicode_strings() {
539        let mut interner = StringInterner::new();
540
541        let sym1 = interner.get_or_intern("こんにちは");
542        let sym2 = interner.get_or_intern("世界");
543        let sym3 = interner.get_or_intern("こんにちは");
544
545        assert_eq!(sym1, sym3);
546        assert_ne!(sym1, sym2);
547
548        assert_eq!(interner.resolve(sym1), Some("こんにちは"));
549        assert_eq!(interner.resolve(sym2), Some("世界"));
550    }
551}