use scirs2_core::metrics::{Counter, Histogram, Timer};
use std::collections::HashMap;
use std::hash::{Hash, Hasher};
use std::sync::{Arc, RwLock, Weak};
/// String interner guarded by `RwLock`s that hands out shared `Arc<str>` values
/// and can optionally assign each string a numeric ID.
///
/// The deduplication table stores only `Weak` handles, so it does not keep a
/// string alive by itself; the ID table stores `Arc` handles and therefore does.
pub struct StringInterner {
/// String content -> weak handle to the shared allocation. Entries whose
/// string has been dropped stay in the map until `cleanup` removes them.
strings: RwLock<HashMap<String, Weak<str>>>,
/// String content -> numeric ID assigned by `intern_with_id`.
string_to_id: RwLock<HashMap<String, u32>>,
/// Numeric ID -> strong handle to the string (reverse of `string_to_id`).
id_to_string: RwLock<HashMap<u32, Arc<str>>>,
/// Next ID to hand out; initialised to 0 and advanced with `fetch_add`.
next_id: AtomicU32,
/// Request / hit / miss bookkeeping updated by the interning methods.
stats: RwLock<InternerStats>,
/// Metrics counter incremented by `intern` whenever a live entry is found.
cache_hit_counter: Arc<Counter>,
/// Metrics counter incremented by `intern` whenever a new entry is created.
cache_miss_counter: Arc<Counter>,
/// Timer started at the beginning of every `intern` call.
intern_timer: Arc<Timer>,
/// Histogram fed with the byte length of every string passed to `intern`.
string_length_histogram: Arc<Histogram>,
/// Histogram fed with the estimated memory footprint by `optimize`.
memory_usage_histogram: Arc<Histogram>,
}
use std::sync::atomic::AtomicU32;
/// Running counters describing how an interner has been used.
#[derive(Debug, Clone, Default)]
pub struct InternerStats {
    /// Number of lookups recorded so far.
    pub total_requests: usize,
    /// Lookups that were answered with an already-interned string.
    pub cache_hits: usize,
    /// Lookups that had to create a new entry.
    pub cache_misses: usize,
    /// Incremented for every entry created; `StringInterner::optimize`
    /// overwrites it with the current table size.
    pub total_strings_stored: usize,
    /// Grows by the byte length of every newly stored string.
    pub memory_saved_bytes: usize,
}

/// Snapshot produced by `StringInterner::memory_usage`.
#[derive(Debug, Clone)]
pub struct MemoryUsage {
    /// Number of entries in the weak string table (stale entries included).
    pub interned_strings: usize,
    /// Number of string -> ID mappings.
    pub id_mappings: usize,
    /// Rough estimate: 64 bytes per table entry plus 8 bytes per ID mapping.
    pub estimated_memory_bytes: usize,
    /// Copied from `InternerStats::memory_saved_bytes`.
    pub memory_saved_bytes: usize,
    /// `saved / (saved + 32 * interned_strings)`, or `0.0` when nothing was saved.
    pub compression_ratio: f64,
}

/// Snapshot produced by `StringInterner::get_metrics` from the metric
/// counters, timer and histograms (not from `InternerStats`).
#[derive(Debug, Clone)]
pub struct InternerMetrics {
    /// Value of the hit counter.
    pub cache_hits: u64,
    /// Value of the miss counter.
    pub cache_misses: u64,
    /// Sum of the hit and miss counters.
    pub total_requests: u64,
    /// `cache_hits / total_requests`, or `0.0` when there were no requests.
    pub hit_ratio: f64,
    /// Mean reported by the intern timer (seconds, going by the field name).
    pub avg_intern_time_secs: f64,
    /// Number of observations reported by the intern timer.
    pub total_intern_observations: u64,
    /// Mean reported by the string-length histogram.
    pub avg_string_length: f64,
    /// Sum reported by the memory-usage histogram, truncated to `u64`.
    pub total_memory_tracked_bytes: u64,
}

impl InternerStats {
    /// Fraction of recorded requests that were hits.
    ///
    /// Returns `0.0` when no request has been recorded, which avoids a
    /// division by zero.
    pub fn hit_ratio(&self) -> f64 {
        match self.total_requests {
            0 => 0.0,
            total => self.cache_hits as f64 / total as f64,
        }
    }
}
impl StringInterner {
/// Creates an interner whose internal maps are pre-sized for 1024 entries.
pub fn new() -> Self {
    const INITIAL_CAPACITY: usize = 1024;
    Self::with_capacity(INITIAL_CAPACITY)
}
pub fn with_capacity(capacity: usize) -> Self {
StringInterner {
strings: RwLock::new(HashMap::with_capacity(capacity)),
string_to_id: RwLock::new(HashMap::with_capacity(capacity)),
id_to_string: RwLock::new(HashMap::with_capacity(capacity)),
next_id: AtomicU32::new(0),
stats: RwLock::new(InternerStats::default()),
cache_hit_counter: Arc::new(Counter::new("interner.cache_hits".to_string())),
cache_miss_counter: Arc::new(Counter::new("interner.cache_misses".to_string())),
intern_timer: Arc::new(Timer::new("interner.intern_time".to_string())),
string_length_histogram: Arc::new(Histogram::new("interner.string_length".to_string())),
memory_usage_histogram: Arc::new(Histogram::new("interner.memory_usage".to_string())),
}
}
pub fn intern(&self, s: &str) -> Arc<str> {
let _guard = self.intern_timer.start();
self.string_length_histogram.observe(s.len() as f64);
{
let strings = self.strings.read().expect("strings lock poisoned");
if let Some(weak_ref) = strings.get(s) {
if let Some(arc_str) = weak_ref.upgrade() {
self.cache_hit_counter.inc();
{
let mut stats = self.stats.write().expect("stats lock poisoned");
stats.total_requests += 1;
stats.cache_hits += 1;
}
return arc_str;
}
}
}
let mut strings = self.strings.write().expect("strings lock poisoned");
if let Some(weak_ref) = strings.get(s) {
if let Some(arc_str) = weak_ref.upgrade() {
self.cache_hit_counter.inc();
drop(strings); {
let mut stats = self.stats.write().expect("stats lock poisoned");
stats.total_requests += 1;
stats.cache_hits += 1;
}
return arc_str;
}
}
let arc_str: Arc<str> = Arc::from(s);
let weak_ref = Arc::downgrade(&arc_str);
strings.insert(s.to_string(), weak_ref);
self.cache_miss_counter.inc();
drop(strings); {
let mut stats = self.stats.write().expect("stats lock poisoned");
stats.total_requests += 1;
stats.cache_misses += 1;
stats.total_strings_stored += 1;
stats.memory_saved_bytes += s.len(); }
arc_str
}
/// Interns `s` and returns it together with a numeric ID that is stable for
/// the lifetime of this interner.
///
/// A string that already has an ID is answered under read locks only and is
/// recorded in `stats` as a hit. Otherwise the string is interned through
/// [`intern`](Self::intern) (which records its own statistics) and a new ID
/// is assigned.
///
/// The assignment happens while holding the write locks of both ID maps and
/// starts by checking again whether another thread registered `s` in the
/// meantime. Without that second check two threads interning the same new
/// string would each allocate an ID, return different IDs for one string and
/// leave an orphaned entry in the reverse map. Writing both maps under the
/// same locks also means readers never see a forward mapping without its
/// reverse mapping.
///
/// # Panics
/// Panics if any of the involved locks is poisoned.
///
/// NOTE(review): the ID counter is a `u32` advanced with `fetch_add`, so it
/// wraps silently after `u32::MAX` assignments. The fast path updates `stats`
/// but not the metric hit counter — confirm whether that is intended.
pub fn intern_with_id(&self, s: &str) -> (Arc<str>, u32) {
    // Fast path: the string already has an ID.
    {
        let string_to_id = self
            .string_to_id
            .read()
            .expect("string_to_id lock poisoned");
        if let Some(&id) = string_to_id.get(s) {
            let id_to_string = self
                .id_to_string
                .read()
                .expect("id_to_string lock poisoned");
            if let Some(arc_str) = id_to_string.get(&id) {
                let mut stats = self.stats.write().expect("stats lock poisoned");
                stats.total_requests += 1;
                stats.cache_hits += 1;
                return (Arc::clone(arc_str), id);
            }
        }
    }

    // Intern before taking the ID locks: `intern` acquires the `strings` and
    // `stats` locks itself, and no ID lock is held while it runs.
    let arc_str = self.intern(s);

    // Lock order (string_to_id, then id_to_string) matches the fast path.
    let mut string_to_id = self
        .string_to_id
        .write()
        .expect("string_to_id lock poisoned");
    let mut id_to_string = self
        .id_to_string
        .write()
        .expect("id_to_string lock poisoned");

    // Another thread may have registered `s` while no lock was held.
    if let Some(&id) = string_to_id.get(s) {
        let existing = id_to_string
            .entry(id)
            .or_insert_with(|| Arc::clone(&arc_str));
        return (Arc::clone(existing), id);
    }

    // Relaxed is sufficient: the surrounding locks order the map updates.
    let id = self
        .next_id
        .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
    string_to_id.insert(s.to_string(), id);
    id_to_string.insert(id, Arc::clone(&arc_str));

    (arc_str, id)
}
/// Looks up the ID previously assigned to `s`, if any.
///
/// # Panics
/// Panics if the `string_to_id` lock is poisoned.
pub fn get_id(&self, s: &str) -> Option<u32> {
    self.string_to_id
        .read()
        .expect("string_to_id lock poisoned")
        .get(s)
        .copied()
}
/// Returns a shared handle to the string registered under `id`, if any.
///
/// # Panics
/// Panics if the `id_to_string` lock is poisoned.
pub fn get_string(&self, id: u32) -> Option<Arc<str>> {
    let by_id = self
        .id_to_string
        .read()
        .expect("id_to_string lock poisoned");
    by_id.get(&id).map(Arc::clone)
}
/// Returns every `(id, string)` pair currently registered.
///
/// The order follows the hash map's iteration order and is therefore
/// unspecified.
///
/// # Panics
/// Panics if the `id_to_string` lock is poisoned.
pub fn get_all_mappings(&self) -> Vec<(u32, Arc<str>)> {
    let by_id = self
        .id_to_string
        .read()
        .expect("id_to_string lock poisoned");

    let mut pairs = Vec::with_capacity(by_id.len());
    for (id, text) in by_id.iter() {
        pairs.push((*id, Arc::clone(text)));
    }
    pairs
}
/// Removes table entries whose string no longer has any strong reference
/// and returns how many entries were removed.
///
/// Statistics are not touched.
///
/// # Panics
/// Panics if the `strings` lock is poisoned.
pub fn cleanup(&self) -> usize {
    let mut table = self.strings.write().expect("strings lock poisoned");
    let initial = table.len();
    table.retain(|_, handle| handle.strong_count() != 0);
    initial - table.len()
}
pub fn stats(&self) -> InternerStats {
self.stats.read().expect("stats lock poisoned").clone()
}
/// Returns the larger of the ID-mapping count and the string-table size.
///
/// The string table may still contain stale entries that `cleanup` has not
/// removed yet; those are counted.
///
/// # Panics
/// Panics if the `string_to_id` or `strings` lock is poisoned.
pub fn len(&self) -> usize {
    // Each lock is released before the next one is taken.
    let with_ids = self
        .string_to_id
        .read()
        .expect("string_to_id lock poisoned")
        .len();
    let interned = self.strings.read().expect("strings lock poisoned").len();
    with_ids.max(interned)
}
/// Returns how many strings currently have an ID assigned.
///
/// # Panics
/// Panics if the `string_to_id` lock is poisoned.
pub fn id_mapping_count(&self) -> usize {
    let ids = self
        .string_to_id
        .read()
        .expect("string_to_id lock poisoned");
    ids.len()
}
/// Returns `true` when the string table has no entries at all
/// (stale entries count as entries).
///
/// # Panics
/// Panics if the `strings` lock is poisoned.
pub fn is_empty(&self) -> bool {
    let table = self.strings.read().expect("strings lock poisoned");
    table.is_empty()
}
/// Interns every string in `strings` and returns the shared handles in the
/// same order.
///
/// The batch is processed in two passes: a first pass under the read lock
/// resolves strings that are already interned, and a second pass under the
/// write lock creates the missing ones. The second pass looks each string up
/// again, which covers both concurrent insertions and duplicates inside the
/// batch itself.
///
/// Statistics are updated once per batch: `total_requests` grows by
/// `strings.len()` and every string is counted as exactly one hit or one
/// miss, including the hits resolved in the first pass and batches that
/// consist of hits only.
///
/// # Panics
/// Panics if the `strings` or `stats` lock is poisoned.
///
/// NOTE(review): unlike `intern`, this method does not feed the metric
/// counters, timer or length histogram — confirm whether it should.
pub fn intern_batch(&self, strings: &[&str]) -> Vec<Arc<str>> {
    if strings.is_empty() {
        return Vec::new();
    }

    let mut result: Vec<Arc<str>> = Vec::with_capacity(strings.len());
    let mut to_create: Vec<(usize, &str)> = Vec::new();
    let mut fast_hits = 0usize;

    // Pass 1: resolve what we can under the shared lock.
    {
        let string_map = self.strings.read().expect("strings lock poisoned");
        // One shared placeholder instead of a fresh allocation per miss;
        // every placeholder slot is overwritten in pass 2.
        let placeholder: Arc<str> = Arc::from("");
        for &s in strings {
            match string_map.get(s).and_then(Weak::upgrade) {
                Some(arc_str) => {
                    result.push(arc_str);
                    fast_hits += 1;
                }
                None => {
                    to_create.push((result.len(), s));
                    result.push(Arc::clone(&placeholder));
                }
            }
        }
    }

    if to_create.is_empty() {
        let mut stats = self.stats.write().expect("stats lock poisoned");
        stats.total_requests += strings.len();
        stats.cache_hits += fast_hits;
        return result;
    }

    // Pass 2: create the missing strings under the exclusive lock.
    let mut string_map = self.strings.write().expect("strings lock poisoned");
    let mut stats = self.stats.write().expect("stats lock poisoned");
    stats.total_requests += strings.len();
    stats.cache_hits += fast_hits;

    for (index, s) in to_create {
        // Inserted by another thread, or earlier in this very batch.
        if let Some(arc_str) = string_map.get(s).and_then(Weak::upgrade) {
            result[index] = arc_str;
            stats.cache_hits += 1;
            continue;
        }

        let arc_str: Arc<str> = Arc::from(s);
        string_map.insert(s.to_string(), Arc::downgrade(&arc_str));
        // The strong handle lives in `result`, keeping the weak entry valid.
        result[index] = arc_str;
        stats.cache_misses += 1;
        stats.total_strings_stored += 1;
        stats.memory_saved_bytes += s.len();
    }

    result
}
/// Runs `strings` through `intern_batch` and discards the returned handles.
///
/// NOTE(review): the string table only holds `Weak` handles, and the handles
/// returned by `intern_batch` are dropped here. Entries created by this call
/// are therefore stale immediately unless another owner already exists —
/// confirm this is the intended effect.
pub fn prefetch(&self, strings: &[&str]) {
    drop(self.intern_batch(strings));
}
/// Builds a rough memory report from the map sizes and the statistics.
///
/// The estimate charges a flat 64 bytes per string-table entry and 8 bytes
/// per ID mapping. The compression ratio is
/// `saved / (saved + 32 * table_entries)`, or `0.0` when nothing was saved.
///
/// # Panics
/// Panics if the `strings`, `string_to_id` or `stats` lock is poisoned.
pub fn memory_usage(&self) -> MemoryUsage {
    let interned_strings = self.strings.read().expect("strings lock poisoned").len();
    let id_mappings = self
        .string_to_id
        .read()
        .expect("string_to_id lock poisoned")
        .len();

    let stats = self.stats.read().expect("stats lock poisoned");
    let saved = stats.memory_saved_bytes;

    let compression_ratio = if saved == 0 {
        0.0
    } else {
        saved as f64 / (saved + interned_strings * 32) as f64
    };

    MemoryUsage {
        interned_strings,
        id_mappings,
        estimated_memory_bytes: interned_strings * 64 + id_mappings * 8,
        memory_saved_bytes: saved,
        compression_ratio,
    }
}
/// Collects a snapshot from the metric instruments (counters, timer and
/// histograms); `InternerStats` is not consulted.
///
/// The hit ratio is `hits / (hits + misses)`, or `0.0` when both counters
/// are zero.
pub fn get_metrics(&self) -> InternerMetrics {
    let hits = self.cache_hit_counter.get();
    let misses = self.cache_miss_counter.get();
    let requests = hits + misses;

    let hit_ratio = match requests {
        0 => 0.0,
        n => hits as f64 / n as f64,
    };

    let timing = self.intern_timer.get_stats();
    let lengths = self.string_length_histogram.get_stats();
    let memory = self.memory_usage_histogram.get_stats();

    InternerMetrics {
        cache_hits: hits,
        cache_misses: misses,
        total_requests: requests,
        hit_ratio,
        avg_intern_time_secs: timing.mean,
        total_intern_observations: timing.count,
        avg_string_length: lengths.mean,
        total_memory_tracked_bytes: memory.sum as u64,
    }
}
pub fn optimize(&self) {
let start = std::time::Instant::now();
let cleaned_count = self.cleanup();
let current_size = {
let strings = self.strings.read().expect("strings lock poisoned");
strings.len()
};
let optimal_capacity = ((current_size as f64 * 1.3) as usize).max(1024);
{
let mut strings = self.strings.write().expect("strings lock poisoned");
let mut string_to_id = self
.string_to_id
.write()
.expect("string_to_id lock poisoned");
let mut id_to_string = self
.id_to_string
.write()
.expect("id_to_string lock poisoned");
let mut new_strings = HashMap::with_capacity(optimal_capacity);
let mut new_string_to_id = HashMap::with_capacity(optimal_capacity);
let mut new_id_to_string = HashMap::with_capacity(optimal_capacity);
for (key, value) in strings.drain() {
new_strings.insert(key, value);
}
for (key, value) in string_to_id.drain() {
new_string_to_id.insert(key, value);
}
for (key, value) in id_to_string.drain() {
new_id_to_string.insert(key, value);
}
*strings = new_strings;
*string_to_id = new_string_to_id;
*id_to_string = new_id_to_string;
}
let mem_usage = self.memory_usage();
self.memory_usage_histogram
.observe(mem_usage.estimated_memory_bytes as f64);
{
let mut stats = self.stats.write().expect("stats lock poisoned");
stats.total_strings_stored = current_size;
}
let duration = start.elapsed();
tracing::debug!(
"Interner optimized: cleaned {} entries, rehashed to capacity {}, took {:?}",
cleaned_count,
optimal_capacity,
duration
);
}
}
impl Default for StringInterner {
fn default() -> Self {
Self::new()
}
}
/// Summarising `Debug` output: map sizes, the next ID and the statistics,
/// rather than the full map contents.
impl std::fmt::Debug for StringInterner {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Values are gathered in the same order in which they are printed.
        let strings_count = self.strings.read().expect("strings lock poisoned").len();
        let id_mappings_count = self
            .string_to_id
            .read()
            .expect("string_to_id lock poisoned")
            .len();
        let next_id = self.next_id.load(std::sync::atomic::Ordering::Relaxed);
        let stats = self.stats.read().expect("stats lock poisoned");

        let mut out = f.debug_struct("StringInterner");
        out.field("strings_count", &strings_count);
        out.field("id_mappings_count", &id_mappings_count);
        out.field("next_id", &next_id);
        out.field("stats", &*stats);
        out.finish()
    }
}
// Process-wide interners, each created with `StringInterner::new()` on first
// access. `InternedString::new`, `new_datatype` and `new_language` use the
// first three; nothing in this file uses `STRING_INTERNER`.
lazy_static::lazy_static! {
// Interner used by `InternedString::new`.
pub static ref IRI_INTERNER: StringInterner = StringInterner::new();
// Interner used by `InternedString::new_datatype`.
pub static ref DATATYPE_INTERNER: StringInterner = StringInterner::new();
// Interner used by `InternedString::new_language`.
pub static ref LANGUAGE_INTERNER: StringInterner = StringInterner::new();
// General-purpose interner for any other strings.
pub static ref STRING_INTERNER: StringInterner = StringInterner::new();
}
/// Owned handle to an interned string.
///
/// Cloning only bumps the reference count of the shared `Arc<str>`.
#[derive(Debug, Clone)]
pub struct InternedString {
/// Shared string data obtained from a `StringInterner`.
inner: Arc<str>,
}
impl InternedString {
    /// Interns `s` in the global `IRI_INTERNER`.
    pub fn new(s: &str) -> Self {
        Self::new_with_interner(s, &IRI_INTERNER)
    }

    /// Interns `s` in the interner supplied by the caller.
    pub fn new_with_interner(s: &str, interner: &StringInterner) -> Self {
        let inner = interner.intern(s);
        Self { inner }
    }

    /// Interns `s` in the global `DATATYPE_INTERNER`.
    pub fn new_datatype(s: &str) -> Self {
        Self::new_with_interner(s, &DATATYPE_INTERNER)
    }

    /// Interns `s` in the global `LANGUAGE_INTERNER`.
    pub fn new_language(s: &str) -> Self {
        Self::new_with_interner(s, &LANGUAGE_INTERNER)
    }

    /// Borrows the string contents.
    pub fn as_str(&self) -> &str {
        &*self.inner
    }

    /// Borrows the underlying shared handle.
    pub fn as_arc_str(&self) -> &Arc<str> {
        let Self { inner } = self;
        inner
    }

    /// Consumes the wrapper and returns the underlying shared handle.
    pub fn into_arc_str(self) -> Arc<str> {
        let Self { inner } = self;
        inner
    }
}
/// Displays the raw string contents. Width and fill flags on the caller's
/// format spec are not applied (the original `write!(f, "{}", ..)` did not
/// forward them either).
impl std::fmt::Display for InternedString {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(&self.inner)
    }
}
/// Lets an `InternedString` be used wherever `&str` methods are expected.
impl std::ops::Deref for InternedString {
    type Target = str;

    fn deref(&self) -> &str {
        &*self.inner
    }
}
/// Cheap borrow of the string contents for APIs taking `impl AsRef<str>`.
impl AsRef<str> for InternedString {
    fn as_ref(&self) -> &str {
        &*self.inner
    }
}
/// Equality by content, with a pointer comparison as a shortcut for handles
/// that share one allocation.
impl PartialEq for InternedString {
    fn eq(&self, other: &Self) -> bool {
        if Arc::ptr_eq(&self.inner, &other.inner) {
            return true;
        }
        // Handles from different interners can hold equal text in different
        // allocations, so fall back to comparing the contents.
        *self.inner == *other.inner
    }
}

impl Eq for InternedString {}
/// Hashes the string contents, consistent with the content-based `Eq`.
impl Hash for InternedString {
    fn hash<H: Hasher>(&self, state: &mut H) {
        let text: &str = &self.inner;
        text.hash(state);
    }
}
/// Total ordering delegated to `Ord`, so the result is always `Some`.
impl PartialOrd for InternedString {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        let ordering = Ord::cmp(self, other);
        Some(ordering)
    }
}
/// Orders interned strings by their contents, using `str`'s ordering.
impl Ord for InternedString {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        let left: &str = &self.inner;
        let right: &str = &other.inner;
        left.cmp(right)
    }
}
impl From<&str> for InternedString {
fn from(s: &str) -> Self {
InternedString::new(s)
}
}
impl From<String> for InternedString {
fn from(s: String) -> Self {
InternedString::new(&s)
}
}
/// Namespace constants and ready-made interned IRIs for common XSD, RDF and
/// RDFS terms.
///
/// Every accessor joins a namespace constant with a local name and interns
/// the result: XSD datatypes go through `InternedString::new_datatype`, RDF
/// and RDFS terms through `InternedString::new`. Implementors may override
/// the namespace constants.
pub trait RdfVocabulary {
    /// XML Schema namespace.
    const XSD_NS: &'static str = "http://www.w3.org/2001/XMLSchema#";
    /// RDF syntax namespace.
    const RDF_NS: &'static str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
    /// RDF Schema namespace.
    const RDFS_NS: &'static str = "http://www.w3.org/2000/01/rdf-schema#";
    /// OWL namespace.
    const OWL_NS: &'static str = "http://www.w3.org/2002/07/owl#";

    /// `xsd:string`
    fn xsd_string() -> InternedString {
        InternedString::new_datatype(&[Self::XSD_NS, "string"].concat())
    }

    /// `xsd:integer`
    fn xsd_integer() -> InternedString {
        InternedString::new_datatype(&[Self::XSD_NS, "integer"].concat())
    }

    /// `xsd:decimal`
    fn xsd_decimal() -> InternedString {
        InternedString::new_datatype(&[Self::XSD_NS, "decimal"].concat())
    }

    /// `xsd:boolean`
    fn xsd_boolean() -> InternedString {
        InternedString::new_datatype(&[Self::XSD_NS, "boolean"].concat())
    }

    /// `xsd:double`
    fn xsd_double() -> InternedString {
        InternedString::new_datatype(&[Self::XSD_NS, "double"].concat())
    }

    /// `xsd:float`
    fn xsd_float() -> InternedString {
        InternedString::new_datatype(&[Self::XSD_NS, "float"].concat())
    }

    /// `xsd:dateTime`
    fn xsd_date_time() -> InternedString {
        InternedString::new_datatype(&[Self::XSD_NS, "dateTime"].concat())
    }

    /// `rdf:type`
    fn rdf_type() -> InternedString {
        InternedString::new(&[Self::RDF_NS, "type"].concat())
    }

    /// `rdfs:label`
    fn rdfs_label() -> InternedString {
        InternedString::new(&[Self::RDFS_NS, "label"].concat())
    }

    /// `rdfs:comment`
    fn rdfs_comment() -> InternedString {
        InternedString::new(&[Self::RDFS_NS, "comment"].concat())
    }
}

impl RdfVocabulary for InternedString {}
// Unit tests for `StringInterner`, `InternedString` and `RdfVocabulary`.
// Tests that go through `InternedString::new*` share the process-wide
// interners; the others create a private `StringInterner`.
#[cfg(test)]
mod tests {
use super::*;
// Equal strings share one allocation; different strings do not.
#[test]
fn test_string_interner_basic() {
let interner = StringInterner::new();
let s1 = interner.intern("http://example.org/test");
let s2 = interner.intern("http://example.org/test");
let s3 = interner.intern("http://example.org/different");
assert!(Arc::ptr_eq(&s1, &s2));
assert!(!Arc::ptr_eq(&s1, &s3));
assert_eq!(s1.as_ref(), "http://example.org/test");
assert_eq!(s2.as_ref(), "http://example.org/test");
assert_eq!(s3.as_ref(), "http://example.org/different");
}
// First lookup is a miss, second lookup of the same string is a hit.
#[test]
fn test_string_interner_stats() {
let interner = StringInterner::new();
let _s1 = interner.intern("test");
let stats = interner.stats();
assert_eq!(stats.total_requests, 1);
assert_eq!(stats.cache_misses, 1);
assert_eq!(stats.cache_hits, 0);
let _s2 = interner.intern("test");
let stats = interner.stats();
assert_eq!(stats.total_requests, 2);
assert_eq!(stats.cache_misses, 1);
assert_eq!(stats.cache_hits, 1);
assert_eq!(stats.hit_ratio(), 0.5);
}
// Once the only strong handle is dropped, `cleanup` removes the entry.
#[test]
fn test_string_interner_cleanup() {
let interner = StringInterner::new();
{
let _s1 = interner.intern("temporary");
assert_eq!(interner.len(), 1);
}
interner.cleanup();
assert_eq!(interner.len(), 0);
}
// `InternedString` equality follows string content.
#[test]
fn test_interned_string_creation() {
let s1 = InternedString::new("http://example.org/test");
let s2 = InternedString::new("http://example.org/test");
let s3 = InternedString::new("http://example.org/different");
assert_eq!(s1, s2);
assert_ne!(s1, s3);
assert_eq!(s1.as_str(), "http://example.org/test");
}
// Ordering is lexicographic by content and usable for sorting.
#[test]
fn test_interned_string_ordering() {
let s1 = InternedString::new("apple");
let s2 = InternedString::new("banana");
let s3 = InternedString::new("apple");
assert!(s1 < s2);
assert!(s2 > s1);
assert_eq!(s1, s3);
let mut strings = vec![s2.clone(), s1.clone(), s3.clone()];
strings.sort();
assert_eq!(strings, vec![s1, s3, s2]);
}
// Equal interned strings hash equally, so they address the same map slot.
#[test]
fn test_interned_string_hashing() {
use std::collections::HashMap;
let s1 = InternedString::new("test");
let s2 = InternedString::new("test");
let s3 = InternedString::new("different");
let mut map = HashMap::new();
map.insert(s1.clone(), "value1");
map.insert(s3.clone(), "value2");
assert_eq!(map.get(&s2), Some(&"value1"));
assert_eq!(map.get(&s3), Some(&"value2"));
assert_eq!(map.len(), 2);
}
// Each global interner deduplicates within its own domain.
#[test]
fn test_global_interners() {
let iri1 = InternedString::new("http://example.org/test");
let iri2 = InternedString::new("http://example.org/test");
let datatype1 = InternedString::new_datatype("http://www.w3.org/2001/XMLSchema#string");
let datatype2 = InternedString::new_datatype("http://www.w3.org/2001/XMLSchema#string");
let lang1 = InternedString::new_language("en");
let lang2 = InternedString::new_language("en");
assert_eq!(iri1, iri2);
assert_eq!(datatype1, datatype2);
assert_eq!(lang1, lang2);
}
// Vocabulary helpers produce the expected full IRIs.
#[test]
fn test_rdf_vocabulary() {
let string_type = InternedString::xsd_string();
let integer_type = InternedString::xsd_integer();
let rdf_type = InternedString::rdf_type();
assert_eq!(
string_type.as_str(),
"http://www.w3.org/2001/XMLSchema#string"
);
assert_eq!(
integer_type.as_str(),
"http://www.w3.org/2001/XMLSchema#integer"
);
assert_eq!(
rdf_type.as_str(),
"http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
);
let string_type2 = InternedString::xsd_string();
assert_eq!(string_type, string_type2);
}
// `Display` prints the raw string.
#[test]
fn test_interned_string_display() {
let s = InternedString::new("http://example.org/test");
assert_eq!(format!("{s}"), "http://example.org/test");
}
// `Deref<Target = str>` exposes `str` methods directly.
#[test]
fn test_interned_string_deref() {
let s = InternedString::new("test");
assert_eq!(&*s, "test");
assert_eq!(s.len(), 4);
assert!(s.starts_with("te"));
}
// `From<&str>` and `From<String>` yield equal values.
#[test]
fn test_interned_string_conversions() {
let s1 = InternedString::from("test");
let s2 = InternedString::from("test".to_string());
assert_eq!(s1, s2);
assert_eq!(s1.as_str(), "test");
}
// Ten threads intern three distinct strings; within each thread equal
// strings must share an allocation, and at most three entries may exist.
#[test]
fn test_concurrent_interning() {
use std::sync::Arc;
use std::thread;
let interner = Arc::new(StringInterner::new());
let handles: Vec<_> = (0..10)
.map(|i| {
let interner = Arc::clone(&interner);
thread::spawn(move || {
let s = format!("http://example.org/test{}", i % 3);
(0..100).map(|_| interner.intern(&s)).collect::<Vec<_>>()
})
})
.collect();
let results: Vec<Vec<Arc<str>>> = handles
.into_iter()
.map(|h| h.join().expect("thread should not panic"))
.collect();
for result_set in &results {
for (i, s1) in result_set.iter().enumerate() {
for s2 in &result_set[i + 1..] {
if s1.as_ref() == s2.as_ref() {
assert!(Arc::ptr_eq(s1, s2));
}
}
}
}
assert!(interner.len() <= 3);
}
// IDs are stable per string and resolvable in both directions.
#[test]
fn test_term_id_mapping() {
let interner = StringInterner::new();
let (arc1, id1) = interner.intern_with_id("test_string");
let (arc2, id2) = interner.intern_with_id("test_string");
assert_eq!(id1, id2);
assert!(Arc::ptr_eq(&arc1, &arc2));
let (arc3, id3) = interner.intern_with_id("different_string");
assert_ne!(id1, id3);
assert!(!Arc::ptr_eq(&arc1, &arc3));
assert_eq!(interner.get_id("test_string"), Some(id1));
assert_eq!(interner.get_id("different_string"), Some(id3));
assert_eq!(interner.get_id("nonexistent"), None);
assert_eq!(
interner
.get_string(id1)
.expect("operation should succeed")
.as_ref(),
"test_string"
);
assert_eq!(
interner
.get_string(id3)
.expect("operation should succeed")
.as_ref(),
"different_string"
);
assert_eq!(interner.get_string(999), None);
}
// Re-interning an existing string must not add a second ID mapping.
#[test]
fn test_id_mapping_stats() {
let interner = StringInterner::new();
assert_eq!(interner.id_mapping_count(), 0);
interner.intern_with_id("string1");
assert_eq!(interner.id_mapping_count(), 1);
interner.intern_with_id("string2");
assert_eq!(interner.id_mapping_count(), 2);
interner.intern_with_id("string1");
assert_eq!(interner.id_mapping_count(), 2);
}
// `get_all_mappings` returns every registered pair (order unspecified).
#[test]
fn test_get_all_mappings() {
let interner = StringInterner::new();
let (_, id1) = interner.intern_with_id("first");
let (_, id2) = interner.intern_with_id("second");
let (_, id3) = interner.intern_with_id("third");
let mappings = interner.get_all_mappings();
assert_eq!(mappings.len(), 3);
let mut found_ids = [false; 3];
for (id, string) in mappings {
match string.as_ref() {
"first" => {
assert_eq!(id, id1);
found_ids[0] = true;
}
"second" => {
assert_eq!(id, id2);
found_ids[1] = true;
}
"third" => {
assert_eq!(id, id3);
found_ids[2] = true;
}
_ => panic!("Unexpected string in mappings"),
}
}
assert!(found_ids.iter().all(|&found| found));
}
// Plain interning and ID interning can be mixed on one interner.
#[test]
fn test_mixed_interning_modes() {
let interner = StringInterner::new();
let arc1 = interner.intern("regular");
let (_arc2, id2) = interner.intern_with_id("with_id");
let arc3 = interner.intern("regular");
assert!(Arc::ptr_eq(&arc1, &arc3));
assert_eq!(
interner
.get_string(id2)
.expect("operation should succeed")
.as_ref(),
"with_id"
);
assert!(interner.len() >= 2);
}
}