use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::hash::{DefaultHasher, Hash, Hasher};
use std::sync::{LazyLock, RwLock};
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize)]
#[repr(transparent)]
pub struct StrKey(u32);
impl StrKey {
pub fn as_u32(self) -> u32 {
self.0
}
}
static GLOBAL_INTERNER: LazyLock<StringInterner> = LazyLock::new(StringInterner::new);
pub fn global_interner() -> &'static StringInterner {
&GLOBAL_INTERNER
}
const INITIAL_CHUNK_CAPACITY: usize = 64 * 1024;
struct InternerInner {
chunks: Vec<String>,
spans: Vec<(u16, u32, u32)>,
map: HashMap<u64, Vec<StrKey>>,
next_chunk_capacity: usize,
}
impl InternerInner {
fn new() -> Self {
let mut inner = Self {
chunks: Vec::new(),
spans: Vec::new(),
map: HashMap::new(),
next_chunk_capacity: INITIAL_CHUNK_CAPACITY,
};
inner
.chunks
.push(String::with_capacity(INITIAL_CHUNK_CAPACITY));
inner
}
fn with_capacity(strings: usize, bytes: usize) -> Self {
let cap = bytes.max(INITIAL_CHUNK_CAPACITY);
let mut inner = Self {
chunks: Vec::new(),
spans: Vec::with_capacity(strings),
map: HashMap::with_capacity(strings),
next_chunk_capacity: cap.checked_next_power_of_two().unwrap_or(cap),
};
inner.chunks.push(String::with_capacity(cap));
inner
}
#[inline]
fn resolve(&self, key: StrKey) -> &str {
let (chunk_idx, start, len) = self.spans[key.0 as usize];
let chunk = &self.chunks[chunk_idx as usize];
&chunk[start as usize..(start + len) as usize]
}
#[inline]
fn hash_str(s: &str) -> u64 {
let mut h = DefaultHasher::new();
s.hash(&mut h);
h.finish()
}
fn get(&self, s: &str) -> Option<StrKey> {
let hash = Self::hash_str(s);
let bucket = self.map.get(&hash)?;
bucket.iter().copied().find(|&key| self.resolve(key) == s)
}
fn intern(&mut self, s: &str) -> StrKey {
let hash = Self::hash_str(s);
if let Some(bucket) = self.map.get(&hash) {
for &key in bucket {
if self.resolve(key) == s {
return key;
}
}
}
let last_idx = self.chunks.len() - 1;
let last = &self.chunks[last_idx];
let chunk_idx = if last.len() + s.len() <= last.capacity() {
last_idx
} else {
let cap = self.next_chunk_capacity.max(s.len());
self.next_chunk_capacity = cap.saturating_mul(2);
self.chunks.push(String::with_capacity(cap));
self.chunks.len() - 1
};
let start = self.chunks[chunk_idx].len() as u32;
self.chunks[chunk_idx].push_str(s);
let len = s.len() as u32;
let key = StrKey(self.spans.len() as u32);
self.spans.push((chunk_idx as u16, start, len));
self.map.entry(hash).or_default().push(key);
key
}
}
pub struct StringInterner {
inner: RwLock<InternerInner>,
}
impl std::fmt::Debug for StringInterner {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("StringInterner").finish_non_exhaustive()
}
}
impl Default for StringInterner {
fn default() -> Self {
Self::new()
}
}
impl StringInterner {
pub fn new() -> Self {
Self {
inner: RwLock::new(InternerInner::new()),
}
}
pub fn with_capacity(strings: usize, bytes: usize) -> Self {
Self {
inner: RwLock::new(InternerInner::with_capacity(strings, bytes)),
}
}
#[inline]
pub fn intern(&self, s: &str) -> StrKey {
{
let guard = self.inner.read().unwrap_or_else(|e| e.into_inner());
if let Some(key) = guard.get(s) {
return key;
}
}
let mut guard = self.inner.write().unwrap_or_else(|e| e.into_inner());
guard.intern(s)
}
#[inline]
pub fn resolve(&self, key: StrKey) -> &str {
let guard = self.inner.read().unwrap_or_else(|e| e.into_inner());
let s: &str = guard.resolve(key);
unsafe { std::mem::transmute::<&str, &str>(s) }
}
#[inline]
pub fn get(&self, s: &str) -> Option<StrKey> {
let guard = self.inner.read().unwrap_or_else(|e| e.into_inner());
guard.get(s)
}
pub fn len(&self) -> usize {
self.inner
.read()
.unwrap_or_else(|e| e.into_inner())
.spans
.len()
}
pub fn is_empty(&self) -> bool {
self.len() == 0
}
pub fn memory_usage(&self) -> usize {
let guard = self.inner.read().unwrap_or_else(|e| e.into_inner());
let chunk_bytes: usize = guard.chunks.iter().map(|c| c.capacity()).sum();
let span_bytes = guard.spans.capacity() * std::mem::size_of::<(u16, u32, u32)>();
let map_overhead = guard.map.capacity() * std::mem::size_of::<(u64, Vec<StrKey>)>();
chunk_bytes + span_bytes + map_overhead
}
pub fn empty_key(&self) -> StrKey {
self.intern("")
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_interner_deduplication() {
let interner = StringInterner::new();
let k1 = interner.intern("src/main.rs");
let k2 = interner.intern("src/main.rs");
let k3 = interner.intern("src/lib.rs");
assert_eq!(k1, k2);
assert_ne!(k1, k3);
assert_eq!(interner.len(), 2);
}
#[test]
fn test_empty_key() {
let interner = StringInterner::new();
let ek = interner.empty_key();
assert_eq!(interner.resolve(ek), "");
assert_eq!(interner.empty_key(), ek);
}
#[test]
fn test_interner_thread_safety() {
let interner = StringInterner::new();
let key = interner.intern("hello");
std::thread::scope(|s| {
for _ in 0..4 {
s.spawn(|| {
let k = interner.intern("hello");
assert_eq!(k, key);
assert_eq!(interner.resolve(k), "hello");
});
}
});
}
#[test]
fn test_interner_many_strings() {
let interner = StringInterner::new();
let mut keys = Vec::new();
for i in 0..10_000 {
keys.push(interner.intern(&format!("string_{i}")));
}
assert_eq!(interner.len(), 10_000);
for (i, key) in keys.iter().enumerate() {
assert_eq!(interner.resolve(*key), format!("string_{i}"));
}
}
#[test]
fn test_interner_resolve_after_growth() {
let interner = StringInterner::new();
let first_key = interner.intern("first");
for i in 0..10_000 {
interner.intern(&format!("padding_{i}"));
}
assert_eq!(interner.resolve(first_key), "first");
}
}