use std::borrow::Cow;
use std::collections::HashMap;
use std::hash::{Hash, Hasher};
use std::sync::Arc;
/// Pool of deduplicated strings that are handed out as shared `Arc<String>` handles.
///
/// Lookups are done by string content, so requesting the same text twice
/// yields handles to the same allocation.
#[derive(Debug, Clone)]
pub struct StringInterner {
    /// Interned strings keyed by their content; the key and the value of an
    /// entry refer to the same shared allocation.
    map: HashMap<InternedString, Arc<String>>,
    /// Usage counters updated as strings are interned.
    stats: InternerStats,
}
/// Usage counters kept by a `StringInterner`.
///
/// All counters start at zero (`Default`).
#[derive(Debug, Clone, Default)]
pub struct InternerStats {
    /// Number of interning requests seen.
    pub total_requests: usize,
    /// Requests that were answered from an already stored string.
    pub cache_hits: usize,
    /// Requests that had to store a new string.
    pub cache_misses: usize,
    /// Number of distinct strings recorded as interned.
    pub unique_strings: usize,
    /// Sum of the byte lengths of all strings served from the cache.
    pub bytes_saved: usize,
}
/// Map key wrapping a shared string.
///
/// Equality and hashing are based purely on the text, and the type borrows as
/// `str`, so a map keyed by it can be queried with a plain `&str`.
#[derive(Debug, Clone, Eq)]
struct InternedString(Arc<String>);

impl InternedString {
    /// Wraps an already shared string.
    fn new(s: Arc<String>) -> Self {
        InternedString(s)
    }

    /// Borrows the wrapped text.
    fn as_str(&self) -> &str {
        self.0.as_str()
    }
}

impl PartialEq for InternedString {
    /// Two keys are equal when their text is equal, whichever allocation
    /// backs them.
    fn eq(&self, other: &Self) -> bool {
        self.as_str() == other.as_str()
    }
}

impl Hash for InternedString {
    /// Feeds the text to the hasher exactly as `str` would, which keeps the
    /// hash consistent with the `Borrow<str>` implementation.
    fn hash<H: Hasher>(&self, state: &mut H) {
        self.as_str().hash(state);
    }
}

impl std::borrow::Borrow<str> for InternedString {
    /// Exposes the text so that `&str` can be used as a lookup key.
    fn borrow(&self) -> &str {
        self.0.as_str()
    }
}
impl StringInterner {
    /// Creates an empty interner pre-sized for 1024 entries.
    pub fn new() -> Self {
        Self::with_capacity(1024)
    }

    /// Creates an empty interner whose map is pre-sized for `capacity`
    /// entries. All statistics start at zero.
    pub fn with_capacity(capacity: usize) -> Self {
        let map = HashMap::with_capacity(capacity);
        let stats = InternerStats::default();
        Self { map, stats }
    }

    /// Creates an interner pre-loaded with common RDF namespaces and terms.
    ///
    /// The pre-loading is not reported as traffic: afterwards every counter is
    /// zero except `unique_strings`, which equals the number of stored
    /// entries.
    pub fn with_common_namespaces() -> Self {
        const SEED_TERMS: &[&str] = &[
            "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
            "http://www.w3.org/2000/01/rdf-schema#",
            "http://www.w3.org/2001/XMLSchema#",
            "http://www.w3.org/2002/07/owl#",
            "http://xmlns.com/foaf/0.1/",
            "http://purl.org/dc/elements/1.1/",
            "http://purl.org/dc/terms/",
            "http://schema.org/",
            "http://www.w3.org/ns/shacl#",
            "http://www.w3.org/2004/02/skos/core#",
            "http://www.w3.org/1999/02/22-rdf-syntax-ns#type",
            "http://www.w3.org/2000/01/rdf-schema#label",
            "http://www.w3.org/2000/01/rdf-schema#comment",
            "http://www.w3.org/2000/01/rdf-schema#subClassOf",
            "http://www.w3.org/2000/01/rdf-schema#subPropertyOf",
            "http://www.w3.org/2000/01/rdf-schema#domain",
            "http://www.w3.org/2000/01/rdf-schema#range",
            "http://www.w3.org/2002/07/owl#sameAs",
            "http://www.w3.org/2002/07/owl#equivalentClass",
            "http://www.w3.org/2002/07/owl#equivalentProperty",
        ];

        let mut seeded = Self::with_capacity(2048);
        for term in SEED_TERMS {
            seeded.intern(term);
        }

        // Discard the counters accumulated while seeding; only the entry
        // count is carried over.
        seeded.stats = InternerStats {
            unique_strings: seeded.map.len(),
            ..InternerStats::default()
        };
        seeded
    }

    /// Returns the shared handle for `s`, storing a copy of it first if it has
    /// not been seen before.
    ///
    /// Every call counts as one request. A hit adds `s.len()` to
    /// `bytes_saved`; a miss increments `cache_misses` and `unique_strings`.
    pub fn intern(&mut self, s: &str) -> Arc<String> {
        self.stats.total_requests += 1;

        match self.map.get(s) {
            Some(existing) => {
                self.stats.cache_hits += 1;
                self.stats.bytes_saved += s.len();
                Arc::clone(existing)
            }
            None => {
                self.stats.cache_misses += 1;
                self.stats.unique_strings += 1;

                // One allocation is shared by the map key, the map value and
                // the handle returned to the caller.
                let shared = Arc::new(s.to_owned());
                self.map
                    .insert(InternedString::new(Arc::clone(&shared)), Arc::clone(&shared));
                shared
            }
        }
    }

    /// Interns `s` only when that looks worthwhile, otherwise hands `s` back
    /// untouched.
    ///
    /// A string is interned when it contains one of a few well-known
    /// vocabulary host names, is one of a fixed set of two-letter language
    /// tags, or is at least `min_length` bytes long. In that case the result
    /// is `Cow::Owned` holding a copy of the interned text; otherwise it is
    /// `Cow::Borrowed(s)` and the interner is not touched.
    pub fn intern_if_beneficial<'a>(&mut self, s: &'a str, min_length: usize) -> Cow<'a, str> {
        const WELL_KNOWN_HOSTS: &[&str] = &["www.w3.org", "schema.org", "xmlns.com", "purl.org"];

        let mentions_known_host = WELL_KNOWN_HOSTS.iter().any(|host| s.contains(*host));
        let is_language_tag = matches!(s, "en" | "de" | "fr" | "es" | "ja" | "zh" | "ar" | "hi");

        if mentions_known_host || is_language_tag || s.len() >= min_length {
            let shared = self.intern(s);
            Cow::Owned(String::clone(&shared))
        } else {
            Cow::Borrowed(s)
        }
    }

    /// Number of strings currently stored.
    pub fn len(&self) -> usize {
        self.map.len()
    }

    /// `true` when no string is stored.
    pub fn is_empty(&self) -> bool {
        self.map.is_empty()
    }

    /// Read-only view of the usage counters.
    pub fn stats(&self) -> &InternerStats {
        &self.stats
    }

    /// Fraction of requests served from the cache, in `0.0..=1.0`.
    ///
    /// Returns `0.0` when no request has been made yet.
    pub fn hit_rate(&self) -> f64 {
        match self.stats.total_requests {
            0 => 0.0,
            total => self.stats.cache_hits as f64 / total as f64,
        }
    }

    /// Drops every stored string and resets all counters to zero.
    pub fn clear(&mut self) {
        self.stats = InternerStats::default();
        self.map.clear();
    }

    /// Estimates the memory held by the interner, in bytes.
    ///
    /// This is a formula over fixed overhead constants, not a measurement:
    /// the map's capacity times a per-slot overhead, plus for each stored
    /// string three `ARC_OVERHEAD` charges, the size of a `String` header and
    /// the length of the text.
    pub fn memory_usage(&self) -> usize {
        const HASHMAP_ENTRY_OVERHEAD: usize = 24;
        const ARC_OVERHEAD: usize = 16;
        // Part of the per-string charge that does not depend on the text.
        const PER_STRING_FIXED: usize = 3 * ARC_OVERHEAD + std::mem::size_of::<String>();

        let table_bytes = self.map.capacity() * HASHMAP_ENTRY_OVERHEAD;
        let string_bytes: usize = self
            .map
            .keys()
            .map(|key| PER_STRING_FIXED + key.as_str().len())
            .sum();

        table_bytes + string_bytes
    }

    /// Asks the underlying map to release unused capacity.
    pub fn shrink_to_fit(&mut self) {
        self.map.shrink_to_fit();
    }
}
impl Default for StringInterner {
fn default() -> Self {
Self::new()
}
}
impl InternerStats {
    /// Renders the counters as a multi-line, human-readable summary.
    ///
    /// The hit percentage is `0.0` when no request has been recorded, and
    /// `bytes_saved` is shown both raw and divided by 1024. Both derived
    /// figures are printed with one decimal place.
    pub fn report(&self) -> String {
        let hit_percent = match self.total_requests {
            0 => 0.0,
            total => self.cache_hits as f64 / total as f64 * 100.0,
        };
        let kilobytes_saved = self.bytes_saved as f64 / 1024.0;

        // The trailing backslashes join the lines and swallow the indentation,
        // so the output has no leading spaces.
        format!(
            "String Interning Statistics:\n\
             - Total requests: {}\n\
             - Cache hits: {} ({:.1}%)\n\
             - Cache misses: {}\n\
             - Unique strings: {}\n\
             - Bytes saved: {} ({:.1} KB)",
            self.total_requests,
            self.cache_hits,
            hit_percent,
            self.cache_misses,
            self.unique_strings,
            self.bytes_saved,
            kilobytes_saved
        )
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Interning the same text twice yields the same allocation.
    #[test]
    fn test_basic_interning() {
        let mut pool = StringInterner::new();
        let first = pool.intern("http://example.org/");
        let second = pool.intern("http://example.org/");

        assert!(Arc::ptr_eq(&first, &second));
        assert_eq!(pool.len(), 1);
    }

    /// Distinct texts get distinct allocations.
    #[test]
    fn test_different_strings() {
        let mut pool = StringInterner::new();
        let left = pool.intern("http://example.org/a");
        let right = pool.intern("http://example.org/b");

        assert!(!Arc::ptr_eq(&left, &right));
        assert_eq!(pool.len(), 2);
    }

    /// Three requests for one string and one for another: two hits, two misses.
    #[test]
    fn test_stats() {
        let mut pool = StringInterner::new();
        for _ in 0..3 {
            pool.intern("test");
        }
        pool.intern("other");

        let counters = pool.stats();
        assert_eq!(counters.total_requests, 4);
        assert_eq!(counters.cache_hits, 2);
        assert_eq!(counters.cache_misses, 2);
        assert_eq!(counters.unique_strings, 2);
    }

    /// Two of four requests are repeats, so half of them are hits.
    #[test]
    fn test_hit_rate() {
        let mut pool = StringInterner::new();
        for text in ["a", "a", "b", "b"].iter() {
            pool.intern(text);
        }

        assert_eq!(pool.hit_rate(), 0.5);
    }

    /// The pre-seeded interner holds entries but reports no traffic.
    #[test]
    fn test_common_namespaces() {
        let pool = StringInterner::with_common_namespaces();

        assert!(pool.len() > 10);
        assert_eq!(pool.stats().total_requests, 0);
        assert_eq!(pool.stats().cache_hits, 0);
    }

    /// Short strings are passed through; long or well-known ones are interned.
    #[test]
    fn test_intern_if_beneficial() {
        let mut pool = StringInterner::new();

        let too_short = pool.intern_if_beneficial("ab", 10);
        assert!(matches!(too_short, Cow::Borrowed(_)));

        let long_enough = pool.intern_if_beneficial("http://example.org/very/long/uri", 10);
        assert!(matches!(long_enough, Cow::Owned(_)));

        let well_known = pool.intern_if_beneficial("www.w3.org", 100);
        assert!(matches!(well_known, Cow::Owned(_)));
    }

    /// A non-empty interner reports a non-zero memory estimate.
    #[test]
    fn test_memory_usage() {
        let mut pool = StringInterner::new();
        pool.intern("short");
        pool.intern("a much longer string that takes more memory");

        let estimate = pool.memory_usage();
        assert!(estimate > 0);
        println!("Memory usage: {} bytes", estimate);
    }

    /// Clearing removes all entries and resets the counters.
    #[test]
    fn test_clear() {
        let mut pool = StringInterner::new();
        pool.intern("test1");
        pool.intern("test2");
        assert_eq!(pool.len(), 2);

        pool.clear();

        assert_eq!(pool.len(), 0);
        assert_eq!(pool.stats().total_requests, 0);
    }

    /// Repeatedly interning a pre-seeded term adds no entries and is almost
    /// always a hit.
    #[test]
    fn test_rdf_use_case() {
        let mut pool = StringInterner::with_common_namespaces();
        let rdf_type = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type";
        let seeded_count = pool.len();

        (0..1000).for_each(|_| {
            pool.intern(rdf_type);
        });

        assert_eq!(pool.len(), seeded_count);
        assert!(pool.hit_rate() > 0.95);
        println!("{}", pool.stats().report());
    }

    /// The textual report contains the expected counter values.
    #[test]
    fn test_stats_report() {
        let mut pool = StringInterner::new();
        for text in ["test", "test", "other"].iter() {
            pool.intern(text);
        }

        let summary = pool.stats().report();
        assert!(summary.contains("Total requests: 3"));
        assert!(summary.contains("Cache hits: 1"));
        assert!(summary.contains("Unique strings: 2"));
    }
}