use lazy_static::lazy_static;
use std::collections::HashMap;
use std::sync::{Arc, RwLock};
// Process-wide interner instance. `lazy_static!` builds it on first access, and it
// is wrapped in an `Arc` so `get_global_interner` can hand out owned handles to it.
lazy_static! {
static ref GLOBAL_INTERNER: Arc<StringInterner> = Arc::new(StringInterner::new());
}
/// Returns an owned handle to the process-wide [`StringInterner`].
///
/// Every call clones the same underlying `Arc`, so all handles refer to one shared
/// interner and strings interned through any of them are visible to the others.
pub fn get_global_interner() -> Arc<StringInterner> {
    // The typed binding makes the deref from the lazy static to the inner `Arc` explicit.
    let shared: &Arc<StringInterner> = &GLOBAL_INTERNER;
    Arc::clone(shared)
}
/// A thread-safe string interner that hands out shared `Arc<str>` handles.
///
/// Interning equal text repeatedly yields handles to one allocation, so callers can
/// compare them with [`Arc::ptr_eq`] and hold many references without copying the
/// bytes. Usage counters are maintained alongside the table and are available
/// through [`StringInterner::stats`].
#[derive(Debug)]
pub struct StringInterner {
    /// Interned strings. The key and value of an entry are clones of the same
    /// `Arc<str>`, so each string's bytes live in exactly one heap allocation.
    map: RwLock<HashMap<Arc<str>, Arc<str>>>,
    /// Hit/miss counters. Whenever both locks are held, `map` is acquired first.
    stats: RwLock<InternerStats>,
}

impl StringInterner {
    /// Approximate per-entry table cost assumed by [`Self::estimated_memory_bytes`].
    const ENTRY_OVERHEAD_BYTES: usize = 48;
    /// Approximate bookkeeping cost of one `Arc` allocation (its reference counts).
    const ARC_HEADER_BYTES: usize = 16;

    /// Creates an empty interner.
    pub fn new() -> Self {
        // A zero-capacity `HashMap` does not allocate, so this matches `HashMap::new()`.
        Self::with_capacity(0)
    }

    /// Creates an empty interner whose table is pre-sized for `capacity` strings.
    pub fn with_capacity(capacity: usize) -> Self {
        Self {
            map: RwLock::new(HashMap::with_capacity(capacity)),
            stats: RwLock::new(InternerStats::default()),
        }
    }

    /// Returns the shared handle for `s`, inserting it on first sight.
    ///
    /// A request that finds `s` already present is counted as a hit; a request that
    /// inserts it is counted as a miss.
    ///
    /// # Panics
    ///
    /// Panics if one of the internal locks has been poisoned.
    pub fn intern(&self, s: &str) -> Arc<str> {
        match self.lookup(s) {
            Some(existing) => existing,
            None => self.intern_slow(s),
        }
    }

    /// Same as [`Self::intern`], for callers that already own a `String`.
    ///
    /// The bytes have to be copied into the `Arc<str>` allocation regardless, so the
    /// owned buffer is only borrowed for the lookup and dropped afterwards.
    ///
    /// # Panics
    ///
    /// Panics if one of the internal locks has been poisoned.
    pub fn intern_owned(&self, s: String) -> Arc<str> {
        self.intern(&s)
    }

    /// Returns a snapshot of the usage counters.
    ///
    /// # Panics
    ///
    /// Panics if the stats lock has been poisoned.
    pub fn stats(&self) -> InternerStats {
        self.stats.read().unwrap().clone()
    }

    /// Returns the number of distinct strings currently stored.
    ///
    /// # Panics
    ///
    /// Panics if the table lock has been poisoned.
    pub fn len(&self) -> usize {
        self.map.read().unwrap().len()
    }

    /// Returns `true` when no strings are stored.
    ///
    /// # Panics
    ///
    /// Panics if the table lock has been poisoned.
    pub fn is_empty(&self) -> bool {
        self.map.read().unwrap().is_empty()
    }

    /// Removes every stored string and resets `unique_strings` to zero.
    ///
    /// Handles returned earlier stay valid; they simply stop being shared with
    /// later `intern` calls. The request counters (`total_interns`, `hits`,
    /// `misses`) are left untouched.
    ///
    /// # Panics
    ///
    /// Panics if one of the internal locks has been poisoned.
    pub fn clear(&self) {
        let mut map = self.map.write().unwrap();
        map.clear();
        let mut stats = self.stats.write().unwrap();
        stats.unique_strings = 0;
    }

    /// Returns a rough estimate of the memory used by the stored strings.
    ///
    /// Each entry is charged a fixed table overhead, the string's byte length, and
    /// the `Arc` bookkeeping overhead. The figure is an approximation, not a
    /// measurement of actual allocator usage.
    ///
    /// # Panics
    ///
    /// Panics if the table lock has been poisoned.
    pub fn estimated_memory_bytes(&self) -> usize {
        self.map
            .read()
            .unwrap()
            .keys()
            .map(|text| Self::ENTRY_OVERHEAD_BYTES + text.len() + Self::ARC_HEADER_BYTES)
            .sum()
    }

    /// Fast path: looks `s` up under the read lock and records a hit when found.
    fn lookup(&self, s: &str) -> Option<Arc<str>> {
        // The read guard is a temporary of this statement, so the table is unlocked
        // again before the stats lock is taken.
        let found = self.map.read().unwrap().get(s).cloned();
        if found.is_some() {
            self.record_hit();
        }
        found
    }

    /// Slow path: takes the write lock, re-checks for `s`, and inserts it if absent.
    fn intern_slow(&self, s: &str) -> Arc<str> {
        let mut map = self.map.write().unwrap();
        // Another thread may have inserted `s` between the read and write locks.
        if let Some(existing) = map.get(s) {
            self.record_hit();
            return Arc::clone(existing);
        }
        let interned: Arc<str> = Arc::from(s);
        map.insert(Arc::clone(&interned), Arc::clone(&interned));
        // Updated while the write lock is held so `unique_strings` matches the table.
        let mut stats = self.stats.write().unwrap();
        stats.total_interns += 1;
        stats.misses += 1;
        stats.unique_strings = map.len();
        interned
    }

    /// Counts one request that was served from the table.
    fn record_hit(&self) {
        let mut stats = self.stats.write().unwrap();
        stats.total_interns += 1;
        stats.hits += 1;
    }
}
impl Default for StringInterner {
fn default() -> Self {
Self::new()
}
}
/// Usage counters reported by a string interner.
#[derive(Debug, Clone, Default)]
pub struct InternerStats {
    /// Number of intern requests recorded (hits plus misses).
    pub total_interns: usize,
    /// Requests that found the string already stored.
    pub hits: usize,
    /// Requests that had to insert a new string.
    pub misses: usize,
    /// Number of distinct strings stored when the counters were last updated.
    pub unique_strings: usize,
}

impl InternerStats {
    /// Fraction of requests that were hits, or `0.0` when nothing was requested.
    pub fn hit_rate(&self) -> f64 {
        match self.total_interns {
            0 => 0.0,
            total => self.hits as f64 / total as f64,
        }
    }

    /// Average number of requests per distinct string, or `0.0` when no strings
    /// are stored.
    pub fn deduplication_ratio(&self) -> f64 {
        if self.unique_strings != 0 {
            return self.total_interns as f64 / self.unique_strings as f64;
        }
        0.0
    }
}
/// Unit tests for the interner, its statistics, and the global instance.
#[cfg(test)]
mod tests {
    use super::*;

    /// Equal text shares one allocation; different text does not.
    #[test]
    fn test_basic_interning() {
        let interner = StringInterner::new();
        let first = interner.intern("hello");
        let again = interner.intern("hello");
        let other = interner.intern("world");
        assert!(Arc::ptr_eq(&first, &again));
        assert!(!Arc::ptr_eq(&first, &other));
        assert_eq!(interner.len(), 2);
    }

    /// Owned and borrowed inputs resolve to the same entry.
    #[test]
    fn test_intern_owned() {
        let interner = StringInterner::new();
        let from_owned = interner.intern_owned("hello".to_string());
        let from_borrowed = interner.intern("hello");
        assert!(Arc::ptr_eq(&from_owned, &from_borrowed));
        assert_eq!(interner.len(), 1);
    }

    /// Counters track hits, misses, and the number of distinct strings.
    #[test]
    fn test_stats() {
        let interner = StringInterner::new();
        interner.intern("a");
        interner.intern("a");
        interner.intern("b");
        interner.intern("a");
        let snapshot = interner.stats();
        assert_eq!(snapshot.total_interns, 4);
        assert_eq!(snapshot.hits, 2);
        assert_eq!(snapshot.misses, 2);
        assert_eq!(snapshot.unique_strings, 2);
        assert_eq!(snapshot.hit_rate(), 0.5);
        assert_eq!(snapshot.deduplication_ratio(), 2.0);
    }

    /// Clearing empties the table but leaves earlier handles usable.
    #[test]
    fn test_clear() {
        let interner = StringInterner::new();
        let before = interner.intern("hello");
        assert_eq!(interner.len(), 1);
        interner.clear();
        assert_eq!(interner.len(), 0);
        assert_eq!(&*before, "hello");
        let after = interner.intern("hello");
        assert!(!Arc::ptr_eq(&before, &after));
    }

    /// A pre-sized interner starts empty and accepts insertions.
    #[test]
    fn test_with_capacity() {
        let interner = StringInterner::with_capacity(100);
        assert_eq!(interner.len(), 0);
        for index in 0..50 {
            interner.intern(&format!("string_{}", index));
        }
        assert_eq!(interner.len(), 50);
    }

    /// The memory estimate grows with the stored strings.
    #[test]
    fn test_memory_estimation() {
        let interner = StringInterner::new();
        interner.intern("short");
        interner.intern("a_longer_string");
        let estimate = interner.estimated_memory_bytes();
        assert!(estimate > 0);
        assert!(estimate > 100);
    }

    /// All global handles point at one interner and share its entries.
    #[test]
    fn test_global_interner() {
        let handle_a = get_global_interner();
        let handle_b = get_global_interner();
        assert!(Arc::ptr_eq(&handle_a, &handle_b));
        let via_a = handle_a.intern("global");
        let via_b = handle_b.intern("global");
        assert!(Arc::ptr_eq(&via_a, &via_b));
    }

    /// The empty string is interned like any other.
    #[test]
    fn test_empty_string() {
        let interner = StringInterner::new();
        let first = interner.intern("");
        let again = interner.intern("");
        assert!(Arc::ptr_eq(&first, &again));
        assert_eq!(&*first, "");
    }

    /// Multi-byte text is deduplicated by content.
    #[test]
    fn test_unicode_strings() {
        let interner = StringInterner::new();
        let chinese = interner.intern("你好世界");
        let chinese_again = interner.intern("你好世界");
        let japanese = interner.intern("こんにちは");
        assert!(Arc::ptr_eq(&chinese, &chinese_again));
        assert!(!Arc::ptr_eq(&chinese, &japanese));
    }

    /// Ten threads interning overlapping keys produce consistent counters.
    #[test]
    fn test_concurrent_access() {
        use std::thread;

        let interner = Arc::new(StringInterner::new());
        // Collecting spawns every worker before any of them is joined.
        let workers: Vec<_> = (0..10)
            .map(|i| {
                let shared = Arc::clone(&interner);
                thread::spawn(move || {
                    for j in 0..100 {
                        shared.intern(&format!("thread_{}_{}", i, j % 10));
                    }
                })
            })
            .collect();
        for worker in workers {
            worker.join().unwrap();
        }
        let snapshot = interner.stats();
        assert!(snapshot.unique_strings <= 100);
        assert!(snapshot.total_interns == 1000);
    }

    /// One string requested a hundred times yields a ratio of 100.
    #[test]
    fn test_deduplication_ratio() {
        let interner = StringInterner::new();
        for _ in 0..100 {
            interner.intern("repeated");
        }
        let snapshot = interner.stats();
        assert_eq!(snapshot.unique_strings, 1);
        assert_eq!(snapshot.deduplication_ratio(), 100.0);
    }
}