Skip to main content

sochdb_core/
string_interner.rs

1// SPDX-License-Identifier: AGPL-3.0-or-later
2// SochDB - LLM-Optimized Embedded Database
3// Copyright (C) 2026 Sushanth Reddy Vanagala (https://github.com/sushanthpy)
4//
5// This program is free software: you can redistribute it and/or modify
6// it under the terms of the GNU Affero General Public License as published by
7// the Free Software Foundation, either version 3 of the License, or
8// (at your option) any later version.
9//
10// This program is distributed in the hope that it will be useful,
11// but WITHOUT ANY WARRANTY; without even the implied warranty of
12// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13// GNU Affero General Public License for more details.
14//
15// You should have received a copy of the GNU Affero General Public License
16// along with this program. If not, see <https://www.gnu.org/licenses/>.
17
18//! String Interning for Path Segments
19//!
20//! This module implements string interning to reduce memory usage when
21//! storing repeated path segments in the PathTrie and other data structures.
22//!
23//! ## Memory Savings
24//!
25//! For paths like "users.profile.settings.theme", the segment "users" may
26//! appear thousands of times across different paths. String interning
27//! ensures each unique segment is stored only once.
28//!
29//! ## Example
30//!
31//! ```rust,ignore
32//! use sochdb_core::string_interner::StringInterner;
33//!
34//! let mut interner = StringInterner::new();
35//!
36//! // Intern a string - returns a small integer symbol
37//! let sym1 = interner.get_or_intern("users");
38//! let sym2 = interner.get_or_intern("users"); // Same symbol returned
39//!
40//! assert_eq!(sym1, sym2); // Same string = same symbol
41//!
42//! // Resolve back to string
43//! let s = interner.resolve(sym1);
44//! assert_eq!(s, Some("users"));
45//! ```
46//!
47//! ## Thread Safety
48//!
49//! The `ConcurrentStringInterner` variant is thread-safe and uses a
50//! sharded lock design for concurrent access.
51
52use parking_lot::RwLock;
53use std::collections::HashMap;
54use std::sync::atomic::{AtomicU32, Ordering};
55
56/// A symbol representing an interned string
57///
58/// This is a small integer that can be used to look up the original string.
59/// Using symbols instead of strings reduces memory usage when the same
60/// string appears multiple times.
61#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
62pub struct Symbol(u32);
63
64impl Symbol {
65    /// Create a symbol from a raw u32 (for deserialization)
66    pub fn from_raw(value: u32) -> Self {
67        Symbol(value)
68    }
69
70    /// Get the raw u32 value (for serialization)
71    pub fn as_raw(&self) -> u32 {
72        self.0
73    }
74}
75
76impl From<u32> for Symbol {
77    fn from(value: u32) -> Self {
78        Symbol(value)
79    }
80}
81
82impl From<Symbol> for u32 {
83    fn from(symbol: Symbol) -> Self {
84        symbol.0
85    }
86}
87
88/// A string interner that stores unique strings and returns symbols
89///
90/// This is the single-threaded version. For concurrent access, use
91/// `ConcurrentStringInterner`.
92#[derive(Debug)]
93pub struct StringInterner {
94    /// Map from string to symbol
95    string_to_symbol: HashMap<String, Symbol>,
96    /// Map from symbol to string (for resolve)
97    symbol_to_string: Vec<String>,
98    /// Next symbol to assign
99    next_symbol: u32,
100}
101
102impl StringInterner {
103    /// Create a new empty string interner
104    pub fn new() -> Self {
105        Self {
106            string_to_symbol: HashMap::new(),
107            symbol_to_string: Vec::new(),
108            next_symbol: 0,
109        }
110    }
111
112    /// Create a new string interner with pre-allocated capacity
113    pub fn with_capacity(capacity: usize) -> Self {
114        Self {
115            string_to_symbol: HashMap::with_capacity(capacity),
116            symbol_to_string: Vec::with_capacity(capacity),
117            next_symbol: 0,
118        }
119    }
120
121    /// Intern a string and return its symbol
122    ///
123    /// If the string is already interned, returns the existing symbol.
124    /// Otherwise, creates a new symbol for this string.
125    pub fn get_or_intern(&mut self, s: &str) -> Symbol {
126        if let Some(&symbol) = self.string_to_symbol.get(s) {
127            return symbol;
128        }
129
130        let symbol = Symbol(self.next_symbol);
131        self.next_symbol += 1;
132
133        self.string_to_symbol.insert(s.to_string(), symbol);
134        self.symbol_to_string.push(s.to_string());
135
136        symbol
137    }
138
139    /// Intern an owned string (avoids clone if string is new)
140    pub fn get_or_intern_owned(&mut self, s: String) -> Symbol {
141        if let Some(&symbol) = self.string_to_symbol.get(&s) {
142            return symbol;
143        }
144
145        let symbol = Symbol(self.next_symbol);
146        self.next_symbol += 1;
147
148        self.symbol_to_string.push(s.clone());
149        self.string_to_symbol.insert(s, symbol);
150
151        symbol
152    }
153
154    /// Look up an already-interned string
155    ///
156    /// Returns None if the string has not been interned
157    pub fn get(&self, s: &str) -> Option<Symbol> {
158        self.string_to_symbol.get(s).copied()
159    }
160
161    /// Resolve a symbol to its string
162    ///
163    /// Returns None if the symbol is invalid
164    pub fn resolve(&self, symbol: Symbol) -> Option<&str> {
165        self.symbol_to_string
166            .get(symbol.0 as usize)
167            .map(|s| s.as_str())
168    }
169
170    /// Get the number of interned strings
171    pub fn len(&self) -> usize {
172        self.symbol_to_string.len()
173    }
174
175    /// Check if the interner is empty
176    pub fn is_empty(&self) -> bool {
177        self.symbol_to_string.is_empty()
178    }
179
180    /// Get memory usage estimate in bytes
181    pub fn memory_usage(&self) -> usize {
182        // HashMap overhead (rough estimate: 3 words per entry)
183        let map_overhead = self.string_to_symbol.len() * (3 * std::mem::size_of::<usize>());
184
185        // String storage
186        let string_storage: usize = self
187            .symbol_to_string
188            .iter()
189            .map(|s| s.len() + std::mem::size_of::<String>())
190            .sum();
191
192        // Vec overhead
193        let vec_overhead = std::mem::size_of::<Vec<String>>()
194            + self.symbol_to_string.capacity() * std::mem::size_of::<String>();
195
196        map_overhead + string_storage + vec_overhead
197    }
198}
199
200impl Default for StringInterner {
201    fn default() -> Self {
202        Self::new()
203    }
204}
205
206/// Number of shards for the concurrent interner
207const SHARD_COUNT: usize = 16;
208
209/// Thread-safe string interner using sharded locks
210///
211/// Uses a simple hash-based sharding to reduce lock contention
212/// when multiple threads are interning strings concurrently.
213#[derive(Debug)]
214pub struct ConcurrentStringInterner {
215    /// Sharded map from string to symbol
216    shards: [RwLock<HashMap<String, Symbol>>; SHARD_COUNT],
217    /// Global symbol table (append-only)
218    symbols: RwLock<Vec<String>>,
219    /// Next symbol to assign (atomic for lock-free reads)
220    next_symbol: AtomicU32,
221}
222
223impl ConcurrentStringInterner {
224    /// Create a new empty concurrent string interner
225    pub fn new() -> Self {
226        Self {
227            shards: std::array::from_fn(|_| RwLock::new(HashMap::new())),
228            symbols: RwLock::new(Vec::new()),
229            next_symbol: AtomicU32::new(0),
230        }
231    }
232
233    /// Create a new concurrent string interner with pre-allocated capacity
234    pub fn with_capacity(capacity: usize) -> Self {
235        let per_shard = capacity / SHARD_COUNT + 1;
236        Self {
237            shards: std::array::from_fn(|_| RwLock::new(HashMap::with_capacity(per_shard))),
238            symbols: RwLock::new(Vec::with_capacity(capacity)),
239            next_symbol: AtomicU32::new(0),
240        }
241    }
242
243    /// Get the shard index for a string
244    #[inline]
245    fn shard_for(&self, s: &str) -> usize {
246        // Simple FNV-1a hash for shard selection
247        let mut hash: u64 = 0xcbf29ce484222325;
248        for byte in s.bytes() {
249            hash ^= byte as u64;
250            hash = hash.wrapping_mul(0x100000001b3);
251        }
252        (hash as usize) % SHARD_COUNT
253    }
254
255    /// Intern a string and return its symbol
256    pub fn get_or_intern(&self, s: &str) -> Symbol {
257        let shard_idx = self.shard_for(s);
258
259        // Try read-only lookup first (fast path)
260        {
261            let shard = self.shards[shard_idx].read();
262            if let Some(&symbol) = shard.get(s) {
263                return symbol;
264            }
265        }
266
267        // Need to insert - acquire write lock
268        let mut shard = self.shards[shard_idx].write();
269
270        // Double-check after acquiring write lock (another thread may have inserted)
271        if let Some(&symbol) = shard.get(s) {
272            return symbol;
273        }
274
275        // Allocate new symbol
276        let symbol = Symbol(self.next_symbol.fetch_add(1, Ordering::SeqCst));
277
278        // Insert into symbol table (under separate lock)
279        {
280            let mut symbols = self.symbols.write();
281            // Ensure symbols vector has space
282            while symbols.len() <= symbol.0 as usize {
283                symbols.push(String::new());
284            }
285            symbols[symbol.0 as usize] = s.to_string();
286        }
287
288        // Insert into shard
289        shard.insert(s.to_string(), symbol);
290
291        symbol
292    }
293
294    /// Look up an already-interned string
295    pub fn get(&self, s: &str) -> Option<Symbol> {
296        let shard_idx = self.shard_for(s);
297        let shard = self.shards[shard_idx].read();
298        shard.get(s).copied()
299    }
300
301    /// Resolve a symbol to its string
302    pub fn resolve(&self, symbol: Symbol) -> Option<String> {
303        let symbols = self.symbols.read();
304        symbols.get(symbol.0 as usize).cloned()
305    }
306
307    /// Get the number of interned strings
308    pub fn len(&self) -> usize {
309        self.next_symbol.load(Ordering::Relaxed) as usize
310    }
311
312    /// Check if the interner is empty
313    pub fn is_empty(&self) -> bool {
314        self.len() == 0
315    }
316
317    /// Get memory usage estimate in bytes
318    pub fn memory_usage(&self) -> usize {
319        let symbols = self.symbols.read();
320
321        // String storage
322        let string_storage: usize = symbols
323            .iter()
324            .map(|s| s.len() + std::mem::size_of::<String>())
325            .sum();
326
327        // Shard overhead (rough estimate)
328        let shard_overhead =
329            SHARD_COUNT * 3 * std::mem::size_of::<usize>() * self.len() / SHARD_COUNT;
330
331        string_storage + shard_overhead
332    }
333}
334
335impl Default for ConcurrentStringInterner {
336    fn default() -> Self {
337        Self::new()
338    }
339}
340
341// Make ConcurrentStringInterner Send + Sync
342unsafe impl Send for ConcurrentStringInterner {}
343unsafe impl Sync for ConcurrentStringInterner {}
344
345/// Global string interner for path segments
346///
347/// This is a convenience wrapper around ConcurrentStringInterner
348/// that provides a global instance for interning path segments.
349pub mod global {
350    use super::*;
351    use std::sync::OnceLock;
352
353    static GLOBAL_INTERNER: OnceLock<ConcurrentStringInterner> = OnceLock::new();
354
355    /// Get the global string interner
356    pub fn interner() -> &'static ConcurrentStringInterner {
357        GLOBAL_INTERNER.get_or_init(ConcurrentStringInterner::new)
358    }
359
360    /// Intern a string in the global interner
361    pub fn intern(s: &str) -> Symbol {
362        interner().get_or_intern(s)
363    }
364
365    /// Resolve a symbol from the global interner
366    pub fn resolve(symbol: Symbol) -> Option<String> {
367        interner().resolve(symbol)
368    }
369
370    /// Get memory usage of the global interner
371    pub fn memory_usage() -> usize {
372        interner().memory_usage()
373    }
374}
375
376#[cfg(test)]
377mod tests {
378    use super::*;
379    use std::thread;
380
381    #[test]
382    fn test_string_interner_basic() {
383        let mut interner = StringInterner::new();
384
385        let sym1 = interner.get_or_intern("hello");
386        let sym2 = interner.get_or_intern("world");
387        let sym3 = interner.get_or_intern("hello"); // Same as sym1
388
389        assert_eq!(sym1, sym3);
390        assert_ne!(sym1, sym2);
391
392        assert_eq!(interner.resolve(sym1), Some("hello"));
393        assert_eq!(interner.resolve(sym2), Some("world"));
394    }
395
396    #[test]
397    fn test_string_interner_get() {
398        let mut interner = StringInterner::new();
399
400        assert_eq!(interner.get("hello"), None);
401
402        let sym = interner.get_or_intern("hello");
403        assert_eq!(interner.get("hello"), Some(sym));
404    }
405
406    #[test]
407    fn test_string_interner_owned() {
408        let mut interner = StringInterner::new();
409
410        let sym1 = interner.get_or_intern_owned("hello".to_string());
411        let sym2 = interner.get_or_intern_owned("hello".to_string());
412
413        assert_eq!(sym1, sym2);
414    }
415
416    #[test]
417    fn test_string_interner_len() {
418        let mut interner = StringInterner::new();
419
420        assert_eq!(interner.len(), 0);
421        assert!(interner.is_empty());
422
423        interner.get_or_intern("hello");
424        assert_eq!(interner.len(), 1);
425
426        interner.get_or_intern("world");
427        assert_eq!(interner.len(), 2);
428
429        interner.get_or_intern("hello"); // Duplicate
430        assert_eq!(interner.len(), 2);
431    }
432
433    #[test]
434    fn test_concurrent_interner_basic() {
435        let interner = ConcurrentStringInterner::new();
436
437        let sym1 = interner.get_or_intern("hello");
438        let sym2 = interner.get_or_intern("world");
439        let sym3 = interner.get_or_intern("hello");
440
441        assert_eq!(sym1, sym3);
442        assert_ne!(sym1, sym2);
443
444        assert_eq!(interner.resolve(sym1), Some("hello".to_string()));
445        assert_eq!(interner.resolve(sym2), Some("world".to_string()));
446    }
447
448    #[test]
449    fn test_concurrent_interner_threaded() {
450        use std::sync::Arc;
451
452        let interner = Arc::new(ConcurrentStringInterner::new());
453        let mut handles = vec![];
454
455        // Spawn multiple threads interning overlapping strings
456        for _i in 0..8 {
457            let interner = Arc::clone(&interner);
458            let handle = thread::spawn(move || {
459                for j in 0..100 {
460                    let s = format!("string_{}", j % 50);
461                    interner.get_or_intern(&s);
462                }
463            });
464            handles.push(handle);
465        }
466
467        for handle in handles {
468            handle.join().unwrap();
469        }
470
471        // Should have 50 unique strings
472        assert_eq!(interner.len(), 50);
473    }
474
475    #[test]
476    fn test_path_segment_interning() {
477        let mut interner = StringInterner::new();
478
479        // Simulate interning path segments
480        let paths = [
481            "users.profile.settings.theme",
482            "users.profile.settings.language",
483            "users.profile.name",
484            "products.inventory.count",
485            "products.inventory.location",
486        ];
487
488        for path in &paths {
489            for segment in path.split('.') {
490                interner.get_or_intern(segment);
491            }
492        }
493
494        // Count unique segments
495        // users, profile, settings, theme, language, name, products, inventory, count, location = 10
496        assert_eq!(interner.len(), 10);
497    }
498
499    #[test]
500    fn test_symbol_serialization() {
501        let mut interner = StringInterner::new();
502
503        let sym = interner.get_or_intern("test");
504        let raw = sym.as_raw();
505        let sym2 = Symbol::from_raw(raw);
506
507        assert_eq!(sym, sym2);
508    }
509
510    #[test]
511    fn test_global_interner() {
512        let sym1 = global::intern("global_test");
513        let sym2 = global::intern("global_test");
514
515        assert_eq!(sym1, sym2);
516        assert_eq!(global::resolve(sym1), Some("global_test".to_string()));
517    }
518
519    #[test]
520    fn test_memory_usage() {
521        let mut interner = StringInterner::new();
522
523        for i in 0..1000 {
524            interner.get_or_intern(&format!("string_{}", i));
525        }
526
527        let usage = interner.memory_usage();
528        // Should be reasonable - less than 100KB for 1000 short strings
529        assert!(usage < 100_000, "Memory usage too high: {}", usage);
530    }
531
532    #[test]
533    fn test_empty_string() {
534        let mut interner = StringInterner::new();
535
536        let sym = interner.get_or_intern("");
537        assert_eq!(interner.resolve(sym), Some(""));
538    }
539
540    #[test]
541    fn test_unicode_strings() {
542        let mut interner = StringInterner::new();
543
544        let sym1 = interner.get_or_intern("こんにちは");
545        let sym2 = interner.get_or_intern("世界");
546        let sym3 = interner.get_or_intern("こんにちは");
547
548        assert_eq!(sym1, sym3);
549        assert_ne!(sym1, sym2);
550
551        assert_eq!(interner.resolve(sym1), Some("こんにちは"));
552        assert_eq!(interner.resolve(sym2), Some("世界"));
553    }
554}