use dashmap::DashMap;
use std::sync::Arc;
/// String interner that hands out shared `Arc<String>` handles for equal
/// string contents.
///
/// Both fields are `Arc`s, so a `clone()` of the interner refers to the same
/// pool and the same counters as the original rather than starting empty.
#[derive(Debug, Clone)]
pub struct StringInterner {
/// Maps string contents to the shared handle returned for those contents.
pool: Arc<DashMap<String, Arc<String>>>,
/// Lookup counters shared by every clone of this interner.
stats: Arc<InternerStats>,
}
/// Atomic counters describing how the interner has been used.
///
/// All counters start at zero (via `Default`).
#[derive(Debug, Default)]
struct InternerStats {
/// Incremented once per `intern` / `intern_owned` call.
lookups: std::sync::atomic::AtomicUsize,
/// Incremented when a lookup finds the string already in the pool.
hits: std::sync::atomic::AtomicUsize,
/// Incremented when a new string is added to the pool; reset to zero by
/// `StringInterner::clear`.
unique_strings: std::sync::atomic::AtomicUsize,
}
impl StringInterner {
    /// Creates an empty interner with all statistics at zero.
    pub fn new() -> Self {
        Self {
            pool: Arc::new(DashMap::new()),
            stats: Arc::new(InternerStats::default()),
        }
    }

    /// Creates an empty interner whose pool is constructed with
    /// `DashMap::with_capacity(capacity)`.
    pub fn with_capacity(capacity: usize) -> Self {
        Self {
            pool: Arc::new(DashMap::with_capacity(capacity)),
            stats: Arc::new(InternerStats::default()),
        }
    }

    /// Returns the shared handle for `s`, adding it to the pool if it is not
    /// there yet.
    ///
    /// Every call that observes the same pool entry receives a handle to the
    /// same allocation, including calls racing on a string that was not yet
    /// present.
    pub fn intern(&self, s: &str) -> Arc<String> {
        Self::bump(&self.stats.lookups);
        // Fast path: a borrowed lookup needs no allocation on a hit.
        if let Some(existing) = self.pool.get(s) {
            Self::bump(&self.stats.hits);
            return Arc::clone(existing.value());
        }
        self.insert_or_share(s.to_string())
    }

    /// Same as [`StringInterner::intern`], but takes ownership of `s` so the
    /// caller's buffer becomes the pool key when the string is new.
    pub fn intern_owned(&self, s: String) -> Arc<String> {
        Self::bump(&self.stats.lookups);
        if let Some(existing) = self.pool.get(s.as_str()) {
            Self::bump(&self.stats.hits);
            return Arc::clone(existing.value());
        }
        self.insert_or_share(s)
    }

    /// Returns the number of entries currently in the pool.
    pub fn unique_count(&self) -> usize {
        self.pool.len()
    }

    /// Walks the pool once and summarises its contents.
    ///
    /// * `total_bytes` sums `len() + size_of::<String>()` over the shared
    ///   values only; the pool's own key copies are not included.
    /// * `total_references` sums `Arc::strong_count` over the values, so it
    ///   includes the one reference the pool itself holds per entry.
    /// * `deduplication_ratio` is `total_references / unique_strings`, or
    ///   `0.0` when no entries were seen.
    pub fn memory_usage(&self) -> MemoryStats {
        let mut unique_strings = 0usize;
        let mut total_bytes = 0usize;
        let mut total_references = 0usize;
        for entry in self.pool.iter() {
            let string = entry.value();
            unique_strings += 1;
            total_bytes += string.len() + std::mem::size_of::<String>();
            total_references += Arc::strong_count(string);
        }
        // Divide by the number of entries actually visited rather than a
        // second `len()` read: the pool may change between the two, which
        // could otherwise yield an inconsistent or infinite ratio.
        let deduplication_ratio = if unique_strings > 0 {
            (total_references as f64) / (unique_strings as f64)
        } else {
            0.0
        };
        MemoryStats {
            unique_strings,
            total_bytes,
            total_references,
            deduplication_ratio,
        }
    }

    /// Removes every entry from the pool and resets the `unique_strings`
    /// counter.
    ///
    /// The `lookups` and `hits` counters are left untouched, so
    /// [`StringInterner::hit_rate`] keeps reflecting the full history.
    /// Handles returned earlier stay usable because each one is its own
    /// `Arc` reference.
    pub fn clear(&self) {
        self.pool.clear();
        self.stats
            .unique_strings
            .store(0, std::sync::atomic::Ordering::Relaxed);
    }

    /// Returns `hits / lookups` as a fraction in `0.0..=1.0`, or `0.0` when
    /// no lookup has been recorded.
    pub fn hit_rate(&self) -> f64 {
        let lookups = self
            .stats
            .lookups
            .load(std::sync::atomic::Ordering::Relaxed);
        let hits = self.stats.hits.load(std::sync::atomic::Ordering::Relaxed);
        if lookups > 0 {
            (hits as f64) / (lookups as f64)
        } else {
            0.0
        }
    }

    /// Slow path shared by `intern` and `intern_owned`, taken after the
    /// borrowed lookup missed.
    ///
    /// The slot is resolved through the map's entry API, so the
    /// check-and-insert happens under the map's own locking: if another
    /// thread added the same string in the meantime, its handle is returned
    /// instead of overwriting it with a second allocation.
    fn insert_or_share(&self, key: String) -> Arc<String> {
        let candidate = Arc::new(key.clone());
        let slot = self.pool.entry(key).or_insert(Arc::clone(&candidate));
        let shared = Arc::clone(slot.value());
        // Release the map guard before touching the counters.
        drop(slot);
        if Arc::ptr_eq(&shared, &candidate) {
            // Our candidate is the one stored in the pool: a new string.
            Self::bump(&self.stats.unique_strings);
        } else {
            // Lost the race; the string was already present after all.
            Self::bump(&self.stats.hits);
        }
        shared
    }

    /// Adds one to a statistics counter. Relaxed ordering is enough because
    /// the counters are independent tallies that guard no other data.
    fn bump(counter: &std::sync::atomic::AtomicUsize) {
        counter.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
    }
}
impl Default for StringInterner {
fn default() -> Self {
Self::new()
}
}
/// Snapshot of pool contents produced by `StringInterner::memory_usage`.
#[derive(Debug, Clone)]
pub struct MemoryStats {
/// Number of entries in the pool.
pub unique_strings: usize,
/// Sum over pooled values of the string length plus `size_of::<String>()`.
pub total_bytes: usize,
/// Sum of `Arc::strong_count` over pooled values; this includes the
/// reference held by the pool itself for each entry.
pub total_references: usize,
/// `total_references` divided by the number of pooled strings, or `0.0`
/// for an empty pool.
pub deduplication_ratio: f64,
}
/// A value whose textual payloads are shared `Arc<String>` handles.
///
/// Built from a `BpsvValue` by `InternedValue::from_bpsv_value`, which maps
/// each variant to the one of the same name here.
#[derive(Debug, Clone, PartialEq)]
pub enum InternedValue {
/// Interned text payload.
String(Arc<String>),
/// Interned payload of a `Hex` value (presumably hex text; confirm against
/// `BpsvValue`).
Hex(Arc<String>),
/// Integer payload, stored inline.
Decimal(i64),
/// No payload.
Empty,
}
impl InternedValue {
    /// Converts a `BpsvValue` into its interned counterpart.
    ///
    /// The `String` and `Hex` payloads are handed to
    /// `interner.intern_owned`; `Decimal` and `Empty` are carried over as-is.
    pub fn from_bpsv_value(value: crate::value::BpsvValue, interner: &StringInterner) -> Self {
        match value {
            crate::value::BpsvValue::Empty => Self::Empty,
            crate::value::BpsvValue::Decimal(number) => Self::Decimal(number),
            crate::value::BpsvValue::Hex(digits) => Self::Hex(interner.intern_owned(digits)),
            crate::value::BpsvValue::String(text) => Self::String(interner.intern_owned(text)),
        }
    }

    /// Returns the text of a `String` or `Hex` value, and `None` for
    /// `Decimal` and `Empty`.
    pub fn as_str(&self) -> Option<&str> {
        match self {
            Self::String(text) | Self::Hex(text) => Some(text.as_str()),
            Self::Decimal(_) | Self::Empty => None,
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Equal strings share one allocation; different strings do not.
    #[test]
    fn test_string_interning() {
        let interner = StringInterner::new();
        let s1 = interner.intern("hello");
        let s2 = interner.intern("hello");
        let s3 = interner.intern("hello");
        assert!(Arc::ptr_eq(&s1, &s2));
        assert!(Arc::ptr_eq(&s2, &s3));
        let s4 = interner.intern("world");
        assert!(!Arc::ptr_eq(&s1, &s4));
        assert_eq!(interner.unique_count(), 2);
    }

    /// Interning the same small set of strings many times yields a high
    /// reference-to-unique ratio.
    #[test]
    fn test_memory_savings() {
        let interner = StringInterner::new();
        let common_values = vec![
            "Region!STRING:0",
            "Encoding!HEX:16",
            "CDNConfig!HEX:16",
            "BuildConfig!HEX:16",
            "us",
            "eu",
            "cn",
            "1",
            "0",
        ];
        // Keep every returned handle alive: the ratio is derived from
        // `Arc::strong_count`, so dropping the handles immediately would
        // leave only the pool's own reference and a ratio of 1.0.
        let mut handles = Vec::with_capacity(100 * common_values.len());
        for _ in 0..100 {
            for value in &common_values {
                handles.push(interner.intern(value));
            }
        }
        let stats = interner.memory_usage();
        assert_eq!(stats.unique_strings, common_values.len());
        assert!(
            stats.deduplication_ratio > 5.0,
            "Expected ratio > 5.0, got {}",
            stats.deduplication_ratio
        );
        println!("Memory stats: {stats:?}");
        println!("Hit rate: {:.2}%", interner.hit_rate() * 100.0);
        drop(handles);
    }

    /// Clones of the interner used from several threads feed one pool:
    /// 10 threads x 100 unique strings plus one common string.
    #[test]
    fn test_concurrent_interning() {
        use std::thread;
        let interner = StringInterner::new();
        let mut handles = vec![];
        for i in 0..10 {
            let interner_clone = interner.clone();
            let handle = thread::spawn(move || {
                for j in 0..100 {
                    interner_clone.intern("common_string");
                    interner_clone.intern(&format!("thread_{i}_unique_{j}"));
                }
            });
            handles.push(handle);
        }
        for handle in handles {
            handle.join().unwrap();
        }
        assert_eq!(interner.unique_count(), 1001);
    }
}