use parking_lot::RwLock;
use std::collections::HashMap;
use std::sync::atomic::{AtomicU32, Ordering};
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub struct Symbol(u32);
impl Symbol {
pub fn from_raw(value: u32) -> Self {
Symbol(value)
}
pub fn as_raw(&self) -> u32 {
self.0
}
}
impl From<u32> for Symbol {
fn from(value: u32) -> Self {
Symbol(value)
}
}
impl From<Symbol> for u32 {
fn from(symbol: Symbol) -> Self {
symbol.0
}
}
#[derive(Debug)]
pub struct StringInterner {
string_to_symbol: HashMap<String, Symbol>,
symbol_to_string: Vec<String>,
next_symbol: u32,
}
impl StringInterner {
pub fn new() -> Self {
Self {
string_to_symbol: HashMap::new(),
symbol_to_string: Vec::new(),
next_symbol: 0,
}
}
pub fn with_capacity(capacity: usize) -> Self {
Self {
string_to_symbol: HashMap::with_capacity(capacity),
symbol_to_string: Vec::with_capacity(capacity),
next_symbol: 0,
}
}
pub fn get_or_intern(&mut self, s: &str) -> Symbol {
if let Some(&symbol) = self.string_to_symbol.get(s) {
return symbol;
}
let symbol = Symbol(self.next_symbol);
self.next_symbol += 1;
self.string_to_symbol.insert(s.to_string(), symbol);
self.symbol_to_string.push(s.to_string());
symbol
}
pub fn get_or_intern_owned(&mut self, s: String) -> Symbol {
if let Some(&symbol) = self.string_to_symbol.get(&s) {
return symbol;
}
let symbol = Symbol(self.next_symbol);
self.next_symbol += 1;
self.symbol_to_string.push(s.clone());
self.string_to_symbol.insert(s, symbol);
symbol
}
pub fn get(&self, s: &str) -> Option<Symbol> {
self.string_to_symbol.get(s).copied()
}
pub fn resolve(&self, symbol: Symbol) -> Option<&str> {
self.symbol_to_string
.get(symbol.0 as usize)
.map(|s| s.as_str())
}
pub fn len(&self) -> usize {
self.symbol_to_string.len()
}
pub fn is_empty(&self) -> bool {
self.symbol_to_string.is_empty()
}
pub fn memory_usage(&self) -> usize {
let map_overhead = self.string_to_symbol.len() * (3 * std::mem::size_of::<usize>());
let string_storage: usize = self
.symbol_to_string
.iter()
.map(|s| s.len() + std::mem::size_of::<String>())
.sum();
let vec_overhead = std::mem::size_of::<Vec<String>>()
+ self.symbol_to_string.capacity() * std::mem::size_of::<String>();
map_overhead + string_storage + vec_overhead
}
}
impl Default for StringInterner {
fn default() -> Self {
Self::new()
}
}
const SHARD_COUNT: usize = 16;
#[derive(Debug)]
pub struct ConcurrentStringInterner {
shards: [RwLock<HashMap<String, Symbol>>; SHARD_COUNT],
symbols: RwLock<Vec<String>>,
next_symbol: AtomicU32,
}
impl ConcurrentStringInterner {
pub fn new() -> Self {
Self {
shards: std::array::from_fn(|_| RwLock::new(HashMap::new())),
symbols: RwLock::new(Vec::new()),
next_symbol: AtomicU32::new(0),
}
}
pub fn with_capacity(capacity: usize) -> Self {
let per_shard = capacity / SHARD_COUNT + 1;
Self {
shards: std::array::from_fn(|_| RwLock::new(HashMap::with_capacity(per_shard))),
symbols: RwLock::new(Vec::with_capacity(capacity)),
next_symbol: AtomicU32::new(0),
}
}
#[inline]
fn shard_for(&self, s: &str) -> usize {
let mut hash: u64 = 0xcbf29ce484222325;
for byte in s.bytes() {
hash ^= byte as u64;
hash = hash.wrapping_mul(0x100000001b3);
}
(hash as usize) % SHARD_COUNT
}
pub fn get_or_intern(&self, s: &str) -> Symbol {
let shard_idx = self.shard_for(s);
{
let shard = self.shards[shard_idx].read();
if let Some(&symbol) = shard.get(s) {
return symbol;
}
}
let mut shard = self.shards[shard_idx].write();
if let Some(&symbol) = shard.get(s) {
return symbol;
}
let symbol = Symbol(self.next_symbol.fetch_add(1, Ordering::SeqCst));
{
let mut symbols = self.symbols.write();
while symbols.len() <= symbol.0 as usize {
symbols.push(String::new());
}
symbols[symbol.0 as usize] = s.to_string();
}
shard.insert(s.to_string(), symbol);
symbol
}
pub fn get(&self, s: &str) -> Option<Symbol> {
let shard_idx = self.shard_for(s);
let shard = self.shards[shard_idx].read();
shard.get(s).copied()
}
pub fn resolve(&self, symbol: Symbol) -> Option<String> {
let symbols = self.symbols.read();
symbols.get(symbol.0 as usize).cloned()
}
pub fn len(&self) -> usize {
self.next_symbol.load(Ordering::Relaxed) as usize
}
pub fn is_empty(&self) -> bool {
self.len() == 0
}
pub fn memory_usage(&self) -> usize {
let symbols = self.symbols.read();
let string_storage: usize = symbols
.iter()
.map(|s| s.len() + std::mem::size_of::<String>())
.sum();
let shard_overhead =
SHARD_COUNT * 3 * std::mem::size_of::<usize>() * self.len() / SHARD_COUNT;
string_storage + shard_overhead
}
}
impl Default for ConcurrentStringInterner {
fn default() -> Self {
Self::new()
}
}
unsafe impl Send for ConcurrentStringInterner {}
unsafe impl Sync for ConcurrentStringInterner {}
pub mod global {
use super::*;
use std::sync::OnceLock;
static GLOBAL_INTERNER: OnceLock<ConcurrentStringInterner> = OnceLock::new();
pub fn interner() -> &'static ConcurrentStringInterner {
GLOBAL_INTERNER.get_or_init(ConcurrentStringInterner::new)
}
pub fn intern(s: &str) -> Symbol {
interner().get_or_intern(s)
}
pub fn resolve(symbol: Symbol) -> Option<String> {
interner().resolve(symbol)
}
pub fn memory_usage() -> usize {
interner().memory_usage()
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::thread;
#[test]
fn test_string_interner_basic() {
let mut interner = StringInterner::new();
let sym1 = interner.get_or_intern("hello");
let sym2 = interner.get_or_intern("world");
let sym3 = interner.get_or_intern("hello");
assert_eq!(sym1, sym3);
assert_ne!(sym1, sym2);
assert_eq!(interner.resolve(sym1), Some("hello"));
assert_eq!(interner.resolve(sym2), Some("world"));
}
#[test]
fn test_string_interner_get() {
let mut interner = StringInterner::new();
assert_eq!(interner.get("hello"), None);
let sym = interner.get_or_intern("hello");
assert_eq!(interner.get("hello"), Some(sym));
}
#[test]
fn test_string_interner_owned() {
let mut interner = StringInterner::new();
let sym1 = interner.get_or_intern_owned("hello".to_string());
let sym2 = interner.get_or_intern_owned("hello".to_string());
assert_eq!(sym1, sym2);
}
#[test]
fn test_string_interner_len() {
let mut interner = StringInterner::new();
assert_eq!(interner.len(), 0);
assert!(interner.is_empty());
interner.get_or_intern("hello");
assert_eq!(interner.len(), 1);
interner.get_or_intern("world");
assert_eq!(interner.len(), 2);
interner.get_or_intern("hello"); assert_eq!(interner.len(), 2);
}
#[test]
fn test_concurrent_interner_basic() {
let interner = ConcurrentStringInterner::new();
let sym1 = interner.get_or_intern("hello");
let sym2 = interner.get_or_intern("world");
let sym3 = interner.get_or_intern("hello");
assert_eq!(sym1, sym3);
assert_ne!(sym1, sym2);
assert_eq!(interner.resolve(sym1), Some("hello".to_string()));
assert_eq!(interner.resolve(sym2), Some("world".to_string()));
}
#[test]
fn test_concurrent_interner_threaded() {
use std::sync::Arc;
let interner = Arc::new(ConcurrentStringInterner::new());
let mut handles = vec![];
for _i in 0..8 {
let interner = Arc::clone(&interner);
let handle = thread::spawn(move || {
for j in 0..100 {
let s = format!("string_{}", j % 50);
interner.get_or_intern(&s);
}
});
handles.push(handle);
}
for handle in handles {
handle.join().unwrap();
}
assert_eq!(interner.len(), 50);
}
#[test]
fn test_path_segment_interning() {
let mut interner = StringInterner::new();
let paths = [
"users.profile.settings.theme",
"users.profile.settings.language",
"users.profile.name",
"products.inventory.count",
"products.inventory.location",
];
for path in &paths {
for segment in path.split('.') {
interner.get_or_intern(segment);
}
}
assert_eq!(interner.len(), 10);
}
#[test]
fn test_symbol_serialization() {
let mut interner = StringInterner::new();
let sym = interner.get_or_intern("test");
let raw = sym.as_raw();
let sym2 = Symbol::from_raw(raw);
assert_eq!(sym, sym2);
}
#[test]
fn test_global_interner() {
let sym1 = global::intern("global_test");
let sym2 = global::intern("global_test");
assert_eq!(sym1, sym2);
assert_eq!(global::resolve(sym1), Some("global_test".to_string()));
}
#[test]
fn test_memory_usage() {
let mut interner = StringInterner::new();
for i in 0..1000 {
interner.get_or_intern(&format!("string_{}", i));
}
let usage = interner.memory_usage();
assert!(usage < 100_000, "Memory usage too high: {}", usage);
}
#[test]
fn test_empty_string() {
let mut interner = StringInterner::new();
let sym = interner.get_or_intern("");
assert_eq!(interner.resolve(sym), Some(""));
}
#[test]
fn test_unicode_strings() {
let mut interner = StringInterner::new();
let sym1 = interner.get_or_intern("こんにちは");
let sym2 = interner.get_or_intern("世界");
let sym3 = interner.get_or_intern("こんにちは");
assert_eq!(sym1, sym3);
assert_ne!(sym1, sym2);
assert_eq!(interner.resolve(sym1), Some("こんにちは"));
assert_eq!(interner.resolve(sym2), Some("世界"));
}
}