use crate::core::hasher::IdentityHasher;
use dashmap::DashMap;
use std::fmt;
use std::hash::BuildHasherDefault;
use std::sync::Arc;
use std::sync::atomic::{AtomicU32, Ordering};
pub const DEFAULT_MAX_INTERNED_STRINGS: usize = 100_000;
const MAX_SPIN_WAIT_ITERATIONS: usize = 100_000;
pub const COMMON_STRINGS: &[&str] = &[
"name",
"id",
"type",
"label",
"created_at",
"updated_at",
"valid_from",
"valid_to",
"tx_from",
"tx_to",
];
#[derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub struct InternedString(u32);
impl InternedString {
#[inline]
pub const fn from_raw(id: u32) -> Self {
InternedString(id)
}
#[inline]
pub const fn as_u32(self) -> u32 {
self.0
}
}
impl fmt::Display for InternedString {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let result = GLOBAL_INTERNER.resolve_with(*self, |s| write!(f, "{}", s));
match result {
Some(res) => res,
None => write!(f, "Interned({})", self.0),
}
}
}
impl fmt::Debug for InternedString {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let result =
GLOBAL_INTERNER.resolve_with(*self, |s| write!(f, "InternedString(\"{}\")", s));
match result {
Some(res) => res,
None => write!(f, "InternedString({})", self.0),
}
}
}
pub struct StringInterner {
string_to_id: DashMap<Arc<str>, InternedString>,
id_to_string: DashMap<InternedString, Arc<str>, BuildHasherDefault<IdentityHasher>>,
next_id: AtomicU32,
max_capacity: usize,
}
impl StringInterner {
pub fn new() -> Self {
Self::with_max_capacity(DEFAULT_MAX_INTERNED_STRINGS)
}
pub fn with_max_capacity(max_capacity: usize) -> Self {
StringInterner {
string_to_id: DashMap::new(),
id_to_string: DashMap::with_hasher(BuildHasherDefault::default()),
next_id: AtomicU32::new(0),
max_capacity,
}
}
pub fn intern<S: AsRef<str>>(
&self,
string: S,
) -> std::result::Result<InternedString, crate::core::error::Error> {
let string = string.as_ref();
if let Some(id) = self.get_id(string) {
return Ok(id);
}
let arc_str: Arc<str> = Arc::from(string);
self.string_to_id
.entry(arc_str.clone())
.or_try_insert_with(|| {
let id_value = self.next_id.fetch_add(1, Ordering::Relaxed);
if id_value >= self.max_capacity as u32 {
self.next_id.fetch_sub(1, Ordering::Relaxed);
return Err(crate::core::error::Error::Storage(
crate::core::error::StorageError::CapacityExceeded {
resource: "string interner".to_string(),
current: id_value as usize,
limit: self.max_capacity,
},
));
}
let id = InternedString(id_value);
self.id_to_string.insert(id, arc_str.clone());
Ok(id)
})
.map(|r| *r)
}
#[inline]
#[allow(dead_code)] pub(crate) fn intern_unchecked<S: AsRef<str>>(&self, string: S) -> InternedString {
let string = string.as_ref();
if let Some(id) = self.get_id(string) {
return id;
}
let arc_str: Arc<str> = Arc::from(string);
*self.string_to_id.entry(arc_str.clone()).or_insert_with(|| {
let id_value = self.next_id.fetch_add(1, Ordering::Relaxed);
let id = InternedString(id_value);
self.id_to_string.insert(id, arc_str.clone());
id
})
}
#[deprecated(
since = "0.1.0",
note = "Use resolve_with() for read-only access; use resolve() only when an owned Arc<str> is strictly required"
)]
pub fn resolve(&self, id: InternedString) -> Option<Arc<str>> {
self.id_to_string
.get(&id)
.map(|entry| Arc::clone(entry.value()))
}
pub fn get(&self, id: InternedString) -> Option<impl AsRef<str> + '_> {
self.id_to_string.get(&id).map(|entry| {
let arc: Arc<str> = Arc::clone(entry.value());
arc
})
}
pub fn resolve_with<F, R>(&self, id: InternedString, f: F) -> Option<R>
where
F: FnOnce(&str) -> R,
{
self.id_to_string
.get(&id)
.map(|entry| f(entry.value().as_ref()))
}
#[deprecated(since = "0.1.0", note = "Use resolve_with() instead")]
pub fn with_str<F, R>(&self, id: InternedString, f: F) -> Option<R>
where
F: FnOnce(&str) -> R,
{
self.resolve_with(id, f)
}
pub fn contains<S: AsRef<str>>(&self, string: S) -> bool {
self.string_to_id.contains_key(string.as_ref())
}
pub fn get_id<S: AsRef<str>>(&self, string: S) -> Option<InternedString> {
self.string_to_id
.get(string.as_ref())
.map(|entry| *entry.value())
}
pub fn len(&self) -> usize {
self.string_to_id.len()
}
pub fn is_empty(&self) -> bool {
self.string_to_id.is_empty()
}
pub fn clear(&self) {
self.string_to_id.clear();
self.id_to_string.clear();
self.next_id.store(0, Ordering::Relaxed);
}
pub fn get_all_strings(&self) -> Vec<String> {
let max_id = self.next_id.load(Ordering::Relaxed) as usize;
let mut strings: Vec<Option<String>> = vec![None; max_id];
for entry in self.id_to_string.iter() {
let id = entry.key().as_u32() as usize;
if id < strings.len() {
strings[id] = Some(entry.value().to_string());
} else {
strings.resize(id + 1, None);
strings[id] = Some(entry.value().to_string());
}
}
for (id, val) in strings.iter_mut().enumerate() {
if val.is_none() {
let mut spins = 0;
loop {
let current_max_id = self.next_id.load(Ordering::Relaxed) as usize;
if id >= current_max_id {
break;
}
if let Some(entry) = self.id_to_string.get(&InternedString::from_raw(id as u32))
{
*val = Some(entry.value().to_string());
break;
}
std::thread::yield_now();
spins += 1;
if spins > MAX_SPIN_WAIT_ITERATIONS {
panic!(
"Deadlock detected in get_all_strings: ID {} never appeared after {} spins. \
This indicates a bug in the interner logic (e.g. rolled back ID not properly handled).",
id, MAX_SPIN_WAIT_ITERATIONS
);
}
}
}
}
strings.into_iter().flatten().collect()
}
pub fn warm_common_strings(&self) {
for s in COMMON_STRINGS {
let _id = self.intern_unchecked(s);
}
}
}
impl Default for StringInterner {
fn default() -> Self {
Self::new()
}
}
use std::sync::LazyLock;
pub const MAX_INTERNED_STRINGS_ENV: &str = "ALETHEIADB_MAX_INTERNED_STRINGS";
pub static GLOBAL_INTERNER: LazyLock<StringInterner> = LazyLock::new(|| {
let max_capacity = std::env::var(MAX_INTERNED_STRINGS_ENV)
.ok()
.and_then(|s| s.parse::<usize>().ok())
.unwrap_or(DEFAULT_MAX_INTERNED_STRINGS);
let interner = StringInterner::with_max_capacity(max_capacity);
interner.warm_common_strings();
interner
});
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_intern_same_string() {
let interner = StringInterner::new();
let id1 = interner.intern("hello").unwrap();
let id2 = interner.intern("hello").unwrap();
assert_eq!(id1, id2, "Same string should get same ID");
}
#[test]
fn test_intern_different_strings() {
let interner = StringInterner::new();
let id1 = interner.intern("hello").unwrap();
let id2 = interner.intern("world").unwrap();
assert_ne!(id1, id2, "Different strings should get different IDs");
}
#[test]
#[allow(deprecated)]
fn test_resolve() {
let interner = StringInterner::new();
let id = interner.intern("test").unwrap();
let resolved = interner.resolve(id).expect("Should resolve");
assert_eq!(resolved.as_ref(), "test");
}
#[test]
#[allow(deprecated)]
fn test_resolve_invalid_id() {
let interner = StringInterner::new();
let invalid_id = InternedString::from_raw(999);
assert!(interner.resolve(invalid_id).is_none());
}
#[test]
fn test_contains() {
let interner = StringInterner::new();
assert!(!interner.contains("test"));
interner.intern("test").unwrap();
assert!(interner.contains("test"));
assert!(!interner.contains("other"));
}
#[test]
fn test_get_id() {
let interner = StringInterner::new();
assert_eq!(interner.get_id("test"), None);
let id = interner.intern("test").unwrap();
assert_eq!(interner.get_id("test"), Some(id));
}
#[test]
fn test_len() {
let interner = StringInterner::new();
assert_eq!(interner.len(), 0);
assert!(interner.is_empty());
interner.intern("a").unwrap();
interner.intern("b").unwrap();
interner.intern("a").unwrap();
assert_eq!(interner.len(), 2);
assert!(!interner.is_empty());
}
#[test]
#[allow(deprecated)]
fn test_clear() {
let interner = StringInterner::new();
let id = interner.intern("test").unwrap();
assert!(interner.resolve(id).is_some());
interner.clear();
assert_eq!(interner.len(), 0);
assert!(interner.resolve(id).is_none());
}
#[test]
fn test_concurrent_interning() {
use std::thread;
let interner = Arc::new(StringInterner::new());
let mut handles = vec![];
for _ in 0..10 {
let interner_clone = Arc::clone(&interner);
let handle = thread::spawn(move || {
let id1 = interner_clone.intern("concurrent").unwrap();
let id2 = interner_clone.intern("test").unwrap();
(id1, id2)
});
handles.push(handle);
}
let results: Vec<_> = handles.into_iter().map(|h| h.join().unwrap()).collect();
let (first_id1, first_id2) = results[0];
for (id1, id2) in results.iter().skip(1) {
assert_eq!(*id1, first_id1);
assert_eq!(*id2, first_id2);
}
assert_eq!(interner.len(), 2);
}
#[test]
fn test_interned_string_size() {
use std::mem::size_of;
assert_eq!(size_of::<InternedString>(), 4);
assert_eq!(size_of::<String>(), 24);
println!("InternedString: {} bytes", size_of::<InternedString>());
println!("String: {} bytes", size_of::<String>());
}
#[test]
#[allow(deprecated)]
fn test_global_interner() {
let id1 = GLOBAL_INTERNER.intern("global").unwrap();
let id2 = GLOBAL_INTERNER.intern("global").unwrap();
assert_eq!(id1, id2);
let resolved = GLOBAL_INTERNER.resolve(id1).unwrap();
assert_eq!(resolved.as_ref(), "global");
}
#[test]
fn test_resolve_with_basic() {
let interner = StringInterner::new();
let id = interner.intern("hello").unwrap();
let result = interner.resolve_with(id, |s| {
assert_eq!(s, "hello");
s.len()
});
assert_eq!(result, Some(5));
}
#[test]
fn test_resolve_with_invalid_id() {
let interner = StringInterner::new();
let invalid_id = InternedString::from_raw(999);
let result = interner.resolve_with(invalid_id, |s| s.len());
assert_eq!(result, None);
}
#[test]
fn test_resolve_with_return_types() {
let interner = StringInterner::new();
let id = interner.intern("test string").unwrap();
let len = interner.resolve_with(id, |s| s.len()).unwrap();
assert_eq!(len, 11);
let uppercase = interner.resolve_with(id, |s| s.to_uppercase()).unwrap();
assert_eq!(uppercase, "TEST STRING");
let contains = interner.resolve_with(id, |s| s.contains("test")).unwrap();
assert!(contains);
let words: Vec<String> = interner
.resolve_with(id, |s| {
s.split_whitespace().map(|w| w.to_string()).collect()
})
.unwrap();
assert_eq!(words, vec!["test", "string"]);
}
#[test]
#[allow(deprecated)]
fn test_resolve_with_no_arc_clone() {
let interner = StringInterner::new();
let id = interner.intern("performance test").unwrap();
let s_arc = interner.resolve(id).unwrap();
let baseline_count = Arc::strong_count(&s_arc);
let mut call_count = 0;
let result = interner.resolve_with(id, |s| {
call_count += 1;
assert_eq!(Arc::strong_count(&s_arc), baseline_count);
s.to_string()
});
assert_eq!(Arc::strong_count(&s_arc), baseline_count);
assert_eq!(result, Some("performance test".to_string()));
assert_eq!(call_count, 1);
}
#[test]
fn test_resolve_with_concurrent() {
use std::thread;
let interner = Arc::new(StringInterner::new());
let id = interner.intern("concurrent").unwrap();
let results: Vec<String> = thread::scope(|s| {
let mut handles = Vec::new();
for i in 0..10 {
let interner_clone = Arc::clone(&interner);
handles.push(s.spawn(move || {
interner_clone
.resolve_with(id, |s| {
assert_eq!(s, "concurrent");
format!("{}-{}", s, i)
})
.unwrap()
}));
}
handles.into_iter().map(|h| h.join().unwrap()).collect()
});
for (i, result) in results.iter().enumerate() {
assert_eq!(result, &format!("concurrent-{}", i));
}
}
#[test]
#[allow(deprecated)]
fn test_resolve_with_vs_resolve_equivalence() {
let interner = StringInterner::new();
let id = interner.intern("equivalence test").unwrap();
let via_resolve_with = interner.resolve_with(id, |s| s.to_string()).unwrap();
let via_resolve = interner.resolve(id).unwrap();
assert_eq!(via_resolve_with, via_resolve.as_ref());
}
#[test]
fn test_resolve_with_empty_string() {
let interner = StringInterner::new();
let id = interner.intern("").unwrap();
let len = interner.resolve_with(id, |s| s.len()).unwrap();
assert_eq!(len, 0);
let is_empty = interner.resolve_with(id, |s| s.is_empty()).unwrap();
assert!(is_empty);
}
#[test]
fn test_resolve_with_panic_safety() {
let interner = StringInterner::new();
let id = interner.intern("panic test").unwrap();
let before = interner.resolve_with(id, |s| s.to_string()).unwrap();
assert_eq!(before, "panic test");
let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
interner.resolve_with(id, |_s| {
panic!("intentional panic in callback");
})
}));
assert!(result.is_err());
let after = interner.resolve_with(id, |s| s.to_string()).unwrap();
assert_eq!(after, "panic test");
let new_id = interner.intern("after panic").unwrap();
let new_str = interner.resolve_with(new_id, |s| s.to_string()).unwrap();
assert_eq!(new_str, "after panic");
}
#[test]
fn test_intern_capacity_exceeded_error() -> crate::core::error::Result<()> {
let interner = StringInterner::with_max_capacity(10);
for i in 0..10 {
interner.intern(format!("string_{}", i))?;
}
assert_eq!(interner.len(), 10);
let result = interner.intern("overflow");
assert!(result.is_err());
let err = result.unwrap_err();
assert!(
matches!(
err,
crate::core::error::Error::Storage(
crate::core::error::StorageError::CapacityExceeded { .. }
)
),
"Expected CapacityExceeded error, got: {:?}",
err
);
assert_eq!(interner.len(), 10);
assert!(!interner.contains("overflow"));
Ok(())
}
#[test]
fn test_intern_concurrent_capacity_race() {
use std::sync::{Arc, Barrier};
use std::thread;
let interner = Arc::new(StringInterner::with_max_capacity(100));
let barrier = Arc::new(Barrier::new(10));
let mut handles = vec![];
for thread_id in 0..10 {
let interner = Arc::clone(&interner);
let barrier = Arc::clone(&barrier);
let handle = thread::spawn(move || {
barrier.wait();
let mut success_count = 0;
for i in 0..20 {
let string = format!("thread_{}_string_{}", thread_id, i);
if interner.intern(&string).is_ok() {
success_count += 1;
}
}
success_count
});
handles.push(handle);
}
let total_successes: usize = handles
.into_iter()
.map(|h: std::thread::JoinHandle<usize>| h.join().unwrap())
.sum();
assert_eq!(total_successes, 100);
assert_eq!(interner.len(), 100);
}
#[test]
fn test_concurrent_intern_same_string_deduplication() {
use std::sync::Arc;
use std::thread;
let interner = Arc::new(StringInterner::new());
let mut handles = vec![];
for _ in 0..100 {
let interner = Arc::clone(&interner);
let handle = thread::spawn(move || interner.intern("concurrent").unwrap());
handles.push(handle);
}
let ids: Vec<_> = handles
.into_iter()
.map(|h: std::thread::JoinHandle<InternedString>| h.join().unwrap())
.collect();
let first_id = ids[0];
for id in &ids[1..] {
assert_eq!(*id, first_id);
}
assert_eq!(interner.len(), 1);
}
#[test]
fn test_warm_common_strings_basic() {
let interner = StringInterner::new();
assert_eq!(interner.len(), 0);
interner.warm_common_strings();
assert!(!interner.is_empty());
for s in COMMON_STRINGS {
assert!(
interner.contains(s),
"Common string '{}' should be interned after warming",
s
);
}
assert_eq!(interner.len(), COMMON_STRINGS.len());
}
#[test]
fn test_warm_common_strings_idempotent() {
let interner = StringInterner::new();
interner.warm_common_strings();
let len_after_first = interner.len();
let id_name_1 = interner.get_id("name").unwrap();
let id_type_1 = interner.get_id("type").unwrap();
interner.warm_common_strings();
let len_after_second = interner.len();
assert_eq!(len_after_first, len_after_second);
let id_name_2 = interner.get_id("name").unwrap();
let id_type_2 = interner.get_id("type").unwrap();
assert_eq!(id_name_1, id_name_2);
assert_eq!(id_type_1, id_type_2);
}
#[test]
fn test_warm_common_strings_no_allocation_on_subsequent_access() {
let interner = StringInterner::new();
interner.warm_common_strings();
let id_before = interner.get_id("name").unwrap();
let id_after = interner.intern("name").unwrap();
assert_eq!(id_before, id_after);
let expected_len = interner.len();
interner.intern("type").unwrap();
interner.intern("label").unwrap();
assert_eq!(interner.len(), expected_len);
}
#[test]
fn test_warm_common_strings_sequential_ids() {
let interner = StringInterner::new();
interner.warm_common_strings();
let mut ids: Vec<_> = COMMON_STRINGS
.iter()
.map(|s| interner.get_id(s).unwrap().as_u32())
.collect();
ids.sort();
for (i, id) in ids.iter().enumerate() {
assert_eq!(*id, i as u32);
}
}
#[test]
#[allow(deprecated)]
fn test_warm_common_strings_with_existing_data() {
let interner = StringInterner::new();
let existing_id = interner.intern("existing").unwrap();
interner.warm_common_strings();
assert_eq!(interner.resolve(existing_id).unwrap().as_ref(), "existing");
assert!(interner.contains("name"));
assert!(interner.contains("type"));
assert_eq!(interner.len(), 11);
}
#[test]
fn test_warm_common_strings_concurrent() {
use std::sync::Arc;
use std::thread;
let interner = Arc::new(StringInterner::new());
let mut handles = vec![];
for _ in 0..10 {
let interner_clone = Arc::clone(&interner);
let handle = thread::spawn(move || {
interner_clone.warm_common_strings();
});
handles.push(handle);
}
for handle in handles {
handle.join().unwrap();
}
assert_eq!(interner.len(), COMMON_STRINGS.len());
for s in COMMON_STRINGS {
assert!(interner.contains(s));
}
}
#[test]
fn test_warm_common_strings_performance_benefit() {
let interner = StringInterner::new();
interner.warm_common_strings();
for s in COMMON_STRINGS {
let id = interner.get_id(s);
assert!(id.is_some(), "String '{}' should be pre-interned", s);
let id_from_intern = interner.intern(s).unwrap();
assert_eq!(id.unwrap(), id_from_intern);
}
}
#[test]
fn test_global_interner_automatically_warmed() {
for s in COMMON_STRINGS {
assert!(
GLOBAL_INTERNER.contains(s),
"Global interner should have '{}' pre-warmed",
s
);
}
assert!(
GLOBAL_INTERNER.len() >= COMMON_STRINGS.len(),
"Global interner should have at least {} strings, has {}",
COMMON_STRINGS.len(),
GLOBAL_INTERNER.len()
);
}
#[test]
fn test_display_impl() {
let s = "display_test_string";
let id = GLOBAL_INTERNER.intern(s).unwrap();
assert_eq!(format!("{}", id), s);
let raw_id = 1_000_000_000;
let id = InternedString::from_raw(raw_id);
assert_eq!(format!("{}", id), format!("Interned({})", raw_id));
}
}
#[cfg(test)]
mod mutant_kill_tests {
use super::*;
use std::sync::Arc;
#[test]
fn test_with_max_capacity_completes_and_honors_limit_via_subprocess() {
use std::process::Command;
use std::time::{Duration, Instant};
let exe = std::env::current_exe().expect("failed to locate current test binary");
let mut child = Command::new(exe)
.args([
"--ignored",
"--exact",
"core::interning::mutant_kill_tests::test_with_max_capacity_subprocess_helper",
])
.spawn()
.expect("failed to spawn subprocess for with_max_capacity test");
let deadline = Instant::now() + Duration::from_secs(10);
loop {
match child.try_wait() {
Ok(Some(status)) => {
assert!(
status.success(),
"subprocess helper failed for StringInterner::with_max_capacity"
);
break;
}
Ok(None) => {
if Instant::now() >= deadline {
let _ = child.kill();
let _ = child.wait();
panic!("StringInterner::with_max_capacity did not complete");
}
std::thread::sleep(Duration::from_millis(10));
}
Err(e) => panic!("failed while polling subprocess: {e}"),
}
}
}
#[test]
#[ignore]
fn test_with_max_capacity_subprocess_helper() {
let interner = StringInterner::with_max_capacity(3);
assert_eq!(interner.max_capacity, 3);
interner.intern("a").unwrap();
interner.intern("b").unwrap();
interner.intern("c").unwrap();
assert!(interner.intern("d").is_err());
}
#[test]
#[allow(deprecated)]
fn test_with_str_returns_callback_result_and_none_for_invalid_id() {
let interner = StringInterner::new();
let id = interner.intern("alpha").unwrap();
let len = interner.with_str(id, |s| s.len());
assert_eq!(len, Some(5));
let invalid = interner.with_str(InternedString::from_raw(999_999), |s| s.len());
assert_eq!(invalid, None);
}
#[test]
fn test_get_all_strings_returns_id_ordered_contents() {
let interner = StringInterner::new();
let first = interner.intern("first").unwrap();
let second = interner.intern("second").unwrap();
let third = interner.intern("third").unwrap();
assert_eq!(first.as_u32(), 0);
assert_eq!(second.as_u32(), 1);
assert_eq!(third.as_u32(), 2);
let all = interner.get_all_strings();
assert_eq!(all.len(), interner.len());
assert_eq!(
all,
vec![
"first".to_string(),
"second".to_string(),
"third".to_string()
]
);
}
#[test]
fn test_get_all_strings_no_panic_under_concurrent_inserts() {
let interner = Arc::new(StringInterner::with_max_capacity(200_000));
for i in 0..128 {
interner.intern(format!("seed_{i}")).unwrap();
}
let writer = {
let interner = Arc::clone(&interner);
std::thread::spawn(move || {
for i in 0..20_000 {
let _ = interner.intern(format!("writer_{i}"));
if i % 256 == 0 {
std::thread::yield_now();
}
}
})
};
for _ in 0..10_000 {
let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
let _ = interner.get_all_strings();
}));
assert!(
result.is_ok(),
"get_all_strings panicked under concurrent inserts"
);
}
writer.join().unwrap();
}
#[test]
fn test_intern_rollback_on_capacity_exceeded() {
let interner = StringInterner::with_max_capacity(1);
let id_a = interner.intern("A").unwrap();
assert_eq!(id_a.as_u32(), 0);
let res = interner.intern("B");
assert!(res.is_err());
let id_c = interner.intern_unchecked("C");
assert_eq!(
id_c.as_u32(),
1,
"ID sequence should be continuous, failed attempt should rollback next_id"
);
let all = interner.get_all_strings();
assert_eq!(all.len(), 2);
assert_eq!(all[0], "A");
assert_eq!(all[1], "C");
}
}