Skip to main content

oxirs_core/
interning.rs

1//! String interning system for performance optimization
2//!
3//! This module provides efficient string interning for commonly used strings
4//! like IRIs, datatype URIs, and other RDF terms. String interning reduces
5//! memory usage and improves comparison performance by ensuring that equal
6//! strings are stored only once and can be compared by pointer equality.
7//!
8//! ## Performance Monitoring
9//!
10//! The interner integrates SciRS2-core metrics for comprehensive monitoring:
11//! - Cache hit/miss rates
12//! - Intern operation timing
13//! - Memory usage tracking
14//! - Deduplication effectiveness
15
16use scirs2_core::metrics::{Counter, Histogram, Timer};
17use std::collections::HashMap;
18use std::hash::{Hash, Hasher};
19use std::sync::{Arc, RwLock, Weak};
20
21/// A thread-safe string interner that deduplicates strings with ID mapping
22///
23/// Integrates SciRS2-core metrics for production-grade performance monitoring:
24/// - Automatic cache hit/miss tracking
25/// - Intern operation timing
26/// - Memory usage histograms
27/// - Deduplication effectiveness metrics
28pub struct StringInterner {
29    /// Map from string content to weak references of interned strings
30    strings: RwLock<HashMap<String, Weak<str>>>,
31    /// Bidirectional mapping between strings and numeric IDs
32    string_to_id: RwLock<HashMap<String, u32>>,
33    /// Map from ID back to string
34    id_to_string: RwLock<HashMap<u32, Arc<str>>>,
35    /// Next available ID
36    next_id: AtomicU32,
37    /// Statistics for monitoring performance
38    stats: RwLock<InternerStats>,
39    /// SciRS2 metrics
40    cache_hit_counter: Arc<Counter>,
41    cache_miss_counter: Arc<Counter>,
42    intern_timer: Arc<Timer>,
43    string_length_histogram: Arc<Histogram>,
44    memory_usage_histogram: Arc<Histogram>,
45}
46
47/// Atomic 32-bit unsigned integer type for thread-safe ID generation
48use std::sync::atomic::AtomicU32;
49
/// Plain counters describing how a [`StringInterner`] has been used.
///
/// A default-constructed value has every counter at zero.
#[derive(Clone, Debug, Default)]
pub struct InternerStats {
    /// Number of intern requests recorded.
    pub total_requests: usize,
    /// Requests answered with an already-interned string.
    pub cache_hits: usize,
    /// Requests that had to store a new string.
    pub cache_misses: usize,
    /// Number of strings stored (reset to the live map size by `optimize`).
    pub total_strings_stored: usize,
    /// Running sum of the byte lengths of newly stored strings; used as an
    /// approximation of the memory saved by later hits.
    pub memory_saved_bytes: usize,
}
59
/// Snapshot of the interner's size and rough memory footprint, as produced by
/// `StringInterner::memory_usage`.
#[derive(Clone, Debug)]
pub struct MemoryUsage {
    /// Number of entries in the interned-string map.
    pub interned_strings: usize,
    /// Number of entries in the string -> ID map.
    pub id_mappings: usize,
    /// Rough footprint estimate in bytes.
    pub estimated_memory_bytes: usize,
    /// Copy of `InternerStats::memory_saved_bytes` at snapshot time.
    pub memory_saved_bytes: usize,
    /// Saved bytes divided by saved bytes plus a per-entry overhead estimate;
    /// `0.0` when nothing has been saved.
    pub compression_ratio: f64,
}
69
/// Values read back from the interner's SciRS2 metric handles by
/// `StringInterner::get_metrics`.
#[derive(Clone, Debug)]
pub struct InternerMetrics {
    /// Value of the cache-hit counter.
    pub cache_hits: u64,
    /// Value of the cache-miss counter.
    pub cache_misses: u64,
    /// Sum of hits and misses.
    pub total_requests: u64,
    /// Hits divided by total requests, in `0.0..=1.0`; `0.0` with no requests.
    pub hit_ratio: f64,
    /// Mean duration of an intern operation, in seconds.
    pub avg_intern_time_secs: f64,
    /// Number of observations recorded by the intern timer.
    pub total_intern_observations: u64,
    /// Mean length of the strings passed to `intern`.
    pub avg_string_length: f64,
    /// Sum of all memory-usage histogram observations, truncated to `u64`.
    pub total_memory_tracked_bytes: u64,
}
90
91impl InternerStats {
92    pub fn hit_ratio(&self) -> f64 {
93        if self.total_requests == 0 {
94            0.0
95        } else {
96            self.cache_hits as f64 / self.total_requests as f64
97        }
98    }
99}
100
101impl StringInterner {
102    /// Create a new string interner with ID mapping and SciRS2 metrics
103    ///
104    /// Automatically tracks:
105    /// - Cache hit/miss rates for intern operations
106    /// - Operation timing for performance analysis
107    /// - String length distribution
108    /// - Memory usage patterns
109    pub fn new() -> Self {
110        Self::with_capacity(1024) // Default capacity for typical usage
111    }
112
113    /// Create a new string interner with specified capacity
114    ///
115    /// Pre-allocates HashMaps to the given capacity to reduce reallocation overhead.
116    /// Initializes all SciRS2 metrics for comprehensive monitoring.
117    pub fn with_capacity(capacity: usize) -> Self {
118        StringInterner {
119            strings: RwLock::new(HashMap::with_capacity(capacity)),
120            string_to_id: RwLock::new(HashMap::with_capacity(capacity)),
121            id_to_string: RwLock::new(HashMap::with_capacity(capacity)),
122            next_id: AtomicU32::new(0),
123            stats: RwLock::new(InternerStats::default()),
124            cache_hit_counter: Arc::new(Counter::new("interner.cache_hits".to_string())),
125            cache_miss_counter: Arc::new(Counter::new("interner.cache_misses".to_string())),
126            intern_timer: Arc::new(Timer::new("interner.intern_time".to_string())),
127            string_length_histogram: Arc::new(Histogram::new("interner.string_length".to_string())),
128            memory_usage_histogram: Arc::new(Histogram::new("interner.memory_usage".to_string())),
129        }
130    }
131
132    /// Intern a string, returning an `Arc<str>` that can be cheaply cloned and compared
133    ///
134    /// This operation is tracked with SciRS2 metrics:
135    /// - Cache hits/misses
136    /// - Operation timing
137    /// - String length distribution
138    pub fn intern(&self, s: &str) -> Arc<str> {
139        let _guard = self.intern_timer.start();
140
141        // Track string length
142        self.string_length_histogram.observe(s.len() as f64);
143
144        // Fast path: try to get existing string with read lock
145        {
146            let strings = self.strings.read().expect("strings lock poisoned");
147            if let Some(weak_ref) = strings.get(s) {
148                if let Some(arc_str) = weak_ref.upgrade() {
149                    // Update stats
150                    self.cache_hit_counter.inc();
151                    {
152                        let mut stats = self.stats.write().expect("stats lock poisoned");
153                        stats.total_requests += 1;
154                        stats.cache_hits += 1;
155                    }
156                    return arc_str;
157                }
158            }
159        }
160
161        // Slow path: need to create new string with write lock
162        let mut strings = self.strings.write().expect("strings lock poisoned");
163
164        // Double-check in case another thread added it while we were waiting
165        if let Some(weak_ref) = strings.get(s) {
166            if let Some(arc_str) = weak_ref.upgrade() {
167                // Update stats
168                self.cache_hit_counter.inc();
169                drop(strings); // Release write lock early
170                {
171                    let mut stats = self.stats.write().expect("stats lock poisoned");
172                    stats.total_requests += 1;
173                    stats.cache_hits += 1;
174                }
175                return arc_str;
176            }
177        }
178
179        // Create new interned string
180        let arc_str: Arc<str> = Arc::from(s);
181        let weak_ref = Arc::downgrade(&arc_str);
182        strings.insert(s.to_string(), weak_ref);
183
184        // Update stats
185        self.cache_miss_counter.inc();
186        drop(strings); // Release write lock early
187        {
188            let mut stats = self.stats.write().expect("stats lock poisoned");
189            stats.total_requests += 1;
190            stats.cache_misses += 1;
191            stats.total_strings_stored += 1;
192            stats.memory_saved_bytes += s.len(); // Approximate memory saved on subsequent hits
193        }
194
195        arc_str
196    }
197
198    /// Intern a string and return both the `Arc<str>` and its numeric ID
199    pub fn intern_with_id(&self, s: &str) -> (Arc<str>, u32) {
200        // Fast path: check if we already have this string and its ID
201        {
202            let string_to_id = self
203                .string_to_id
204                .read()
205                .expect("string_to_id lock poisoned");
206            if let Some(&id) = string_to_id.get(s) {
207                // We have the ID, now get the Arc<str>
208                let id_to_string = self
209                    .id_to_string
210                    .read()
211                    .expect("id_to_string lock poisoned");
212                if let Some(arc_str) = id_to_string.get(&id) {
213                    // Update stats
214                    {
215                        let mut stats = self.stats.write().expect("stats lock poisoned");
216                        stats.total_requests += 1;
217                        stats.cache_hits += 1;
218                    }
219                    return (arc_str.clone(), id);
220                }
221            }
222        }
223
224        // Slow path: need to create new entry
225        let arc_str = self.intern(s); // This will handle the string interning
226        let id = self
227            .next_id
228            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
229
230        // Update the ID mappings
231        {
232            let mut string_to_id = self
233                .string_to_id
234                .write()
235                .expect("string_to_id lock poisoned");
236            string_to_id.insert(s.to_string(), id);
237        }
238        {
239            let mut id_to_string = self
240                .id_to_string
241                .write()
242                .expect("id_to_string lock poisoned");
243            id_to_string.insert(id, arc_str.clone());
244        }
245
246        (arc_str, id)
247    }
248
249    /// Get the ID for a string if it's already interned
250    pub fn get_id(&self, s: &str) -> Option<u32> {
251        let string_to_id = self
252            .string_to_id
253            .read()
254            .expect("string_to_id lock poisoned");
255        string_to_id.get(s).copied()
256    }
257
258    /// Get the string for an ID if it exists
259    pub fn get_string(&self, id: u32) -> Option<Arc<str>> {
260        let id_to_string = self
261            .id_to_string
262            .read()
263            .expect("id_to_string lock poisoned");
264        id_to_string.get(&id).cloned()
265    }
266
267    /// Get all ID mappings (useful for serialization/debugging)
268    pub fn get_all_mappings(&self) -> Vec<(u32, Arc<str>)> {
269        let id_to_string = self
270            .id_to_string
271            .read()
272            .expect("id_to_string lock poisoned");
273        id_to_string
274            .iter()
275            .map(|(&id, s)| (id, s.clone()))
276            .collect()
277    }
278
279    /// Clean up expired weak references to save memory
280    ///
281    /// Returns the number of entries cleaned up
282    pub fn cleanup(&self) -> usize {
283        let mut strings = self.strings.write().expect("strings lock poisoned");
284        let before = strings.len();
285        strings.retain(|_, weak_ref| weak_ref.strong_count() > 0);
286        let after = strings.len();
287        before - after
288    }
289
290    /// Get current statistics
291    pub fn stats(&self) -> InternerStats {
292        self.stats.read().expect("stats lock poisoned").clone()
293    }
294
295    /// Get the number of unique strings currently stored
296    pub fn len(&self) -> usize {
297        // Return the maximum of both counts to handle mixed usage
298        let id_count = self
299            .string_to_id
300            .read()
301            .expect("string_to_id lock poisoned")
302            .len();
303        let string_count = self.strings.read().expect("strings lock poisoned").len();
304        std::cmp::max(id_count, string_count)
305    }
306
307    /// Get the number of strings with ID mappings
308    pub fn id_mapping_count(&self) -> usize {
309        self.string_to_id
310            .read()
311            .expect("string_to_id lock poisoned")
312            .len()
313    }
314
315    /// Check if the interner is empty
316    pub fn is_empty(&self) -> bool {
317        self.strings
318            .read()
319            .expect("strings lock poisoned")
320            .is_empty()
321    }
322
323    /// Batch intern multiple strings for improved performance
324    /// Returns a Vec of `Arc<str>` in the same order as input
325    pub fn intern_batch(&self, strings: &[&str]) -> Vec<Arc<str>> {
326        let mut result = Vec::with_capacity(strings.len());
327        let mut to_create = Vec::new();
328
329        // First pass: collect existing strings with read lock
330        {
331            let string_map = self.strings.read().expect("strings lock poisoned");
332            for &s in strings {
333                if let Some(weak_ref) = string_map.get(s) {
334                    if let Some(arc_str) = weak_ref.upgrade() {
335                        result.push(arc_str);
336                        continue;
337                    }
338                }
339                to_create.push((result.len(), s));
340                result.push(Arc::from("")); // Placeholder
341            }
342        }
343
344        // Second pass: create missing strings with write lock
345        if !to_create.is_empty() {
346            let mut string_map = self.strings.write().expect("strings lock poisoned");
347            let mut stats = self.stats.write().expect("stats lock poisoned");
348
349            for (index, s) in to_create {
350                // Double-check in case another thread added it
351                if let Some(weak_ref) = string_map.get(s) {
352                    if let Some(arc_str) = weak_ref.upgrade() {
353                        result[index] = arc_str;
354                        stats.cache_hits += 1;
355                        continue;
356                    }
357                }
358
359                // Create new interned string
360                let arc_str: Arc<str> = Arc::from(s);
361                let weak_ref = Arc::downgrade(&arc_str);
362                string_map.insert(s.to_string(), weak_ref);
363                result[index] = arc_str;
364
365                stats.cache_misses += 1;
366                stats.total_strings_stored += 1;
367                stats.memory_saved_bytes += s.len();
368            }
369
370            stats.total_requests += strings.len();
371        }
372
373        result
374    }
375
376    /// Prefetch strings into the interner cache for improved performance
377    /// This is useful when you know you'll need certain strings soon
378    pub fn prefetch(&self, strings: &[&str]) {
379        let _ = self.intern_batch(strings);
380    }
381
382    /// Get memory usage statistics for performance monitoring
383    pub fn memory_usage(&self) -> MemoryUsage {
384        let string_map_size = self.strings.read().expect("strings lock poisoned").len();
385        let id_map_size = self
386            .string_to_id
387            .read()
388            .expect("string_to_id lock poisoned")
389            .len();
390        let stats = self.stats.read().expect("stats lock poisoned");
391
392        MemoryUsage {
393            interned_strings: string_map_size,
394            id_mappings: id_map_size,
395            estimated_memory_bytes: string_map_size * 64 + id_map_size * 8, // Rough estimate
396            memory_saved_bytes: stats.memory_saved_bytes,
397            compression_ratio: if stats.memory_saved_bytes > 0 {
398                stats.memory_saved_bytes as f64
399                    / (stats.memory_saved_bytes + string_map_size * 32) as f64
400            } else {
401                0.0
402            },
403        }
404    }
405
406    /// Get comprehensive performance metrics from SciRS2
407    ///
408    /// Returns detailed statistics including:
409    /// - Cache hit/miss counts and ratios
410    /// - Operation timing statistics
411    /// - String length distribution
412    /// - Memory usage patterns
413    pub fn get_metrics(&self) -> InternerMetrics {
414        let cache_hits = self.cache_hit_counter.get();
415        let cache_misses = self.cache_miss_counter.get();
416        let total_requests = cache_hits + cache_misses;
417        let hit_ratio = if total_requests > 0 {
418            cache_hits as f64 / total_requests as f64
419        } else {
420            0.0
421        };
422
423        let timer_stats = self.intern_timer.get_stats();
424        let string_length_stats = self.string_length_histogram.get_stats();
425        let memory_stats = self.memory_usage_histogram.get_stats();
426
427        InternerMetrics {
428            cache_hits,
429            cache_misses,
430            total_requests,
431            hit_ratio,
432            avg_intern_time_secs: timer_stats.mean,
433            total_intern_observations: timer_stats.count,
434            avg_string_length: string_length_stats.mean,
435            total_memory_tracked_bytes: memory_stats.sum as u64,
436        }
437    }
438
439    /// Optimize the interner by cleaning up and compacting data structures
440    ///
441    /// Performs comprehensive optimization:
442    /// - Cleans up expired weak references
443    /// - Rehashes HashMaps with optimal capacity
444    /// - Updates memory usage statistics
445    /// - Tracks optimization impact with SciRS2 metrics
446    pub fn optimize(&self) {
447        let start = std::time::Instant::now();
448
449        // Clean up expired weak references
450        let cleaned_count = self.cleanup();
451
452        // Get current sizes
453        let current_size = {
454            let strings = self.strings.read().expect("strings lock poisoned");
455            strings.len()
456        };
457
458        // Rehash with optimal capacity (1.3x current size to reduce future reallocations)
459        let optimal_capacity = ((current_size as f64 * 1.3) as usize).max(1024);
460
461        {
462            let mut strings = self.strings.write().expect("strings lock poisoned");
463            let mut string_to_id = self
464                .string_to_id
465                .write()
466                .expect("string_to_id lock poisoned");
467            let mut id_to_string = self
468                .id_to_string
469                .write()
470                .expect("id_to_string lock poisoned");
471
472            // Create new maps with optimal capacity
473            let mut new_strings = HashMap::with_capacity(optimal_capacity);
474            let mut new_string_to_id = HashMap::with_capacity(optimal_capacity);
475            let mut new_id_to_string = HashMap::with_capacity(optimal_capacity);
476
477            // Move data to new maps (rehashing in the process)
478            for (key, value) in strings.drain() {
479                new_strings.insert(key, value);
480            }
481            for (key, value) in string_to_id.drain() {
482                new_string_to_id.insert(key, value);
483            }
484            for (key, value) in id_to_string.drain() {
485                new_id_to_string.insert(key, value);
486            }
487
488            // Replace with optimized maps
489            *strings = new_strings;
490            *string_to_id = new_string_to_id;
491            *id_to_string = new_id_to_string;
492        }
493
494        // Calculate and track memory usage
495        let mem_usage = self.memory_usage();
496        self.memory_usage_histogram
497            .observe(mem_usage.estimated_memory_bytes as f64);
498
499        // Update stats
500        {
501            let mut stats = self.stats.write().expect("stats lock poisoned");
502            stats.total_strings_stored = current_size;
503        }
504
505        let duration = start.elapsed();
506        tracing::debug!(
507            "Interner optimized: cleaned {} entries, rehashed to capacity {}, took {:?}",
508            cleaned_count,
509            optimal_capacity,
510            duration
511        );
512    }
513}
514
515impl Default for StringInterner {
516    fn default() -> Self {
517        Self::new()
518    }
519}
520
521impl std::fmt::Debug for StringInterner {
522    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
523        f.debug_struct("StringInterner")
524            .field(
525                "strings_count",
526                &self.strings.read().expect("strings lock poisoned").len(),
527            )
528            .field(
529                "id_mappings_count",
530                &self
531                    .string_to_id
532                    .read()
533                    .expect("string_to_id lock poisoned")
534                    .len(),
535            )
536            .field(
537                "next_id",
538                &self.next_id.load(std::sync::atomic::Ordering::Relaxed),
539            )
540            .field("stats", &self.stats.read().expect("stats lock poisoned"))
541            .finish()
542    }
543}
544
545lazy_static::lazy_static! {
546    // Global interner instances for common string types
547    /// Global interner for IRI strings
548    pub static ref IRI_INTERNER: StringInterner = StringInterner::new();
549
550    /// Global interner for datatype IRIs
551    pub static ref DATATYPE_INTERNER: StringInterner = StringInterner::new();
552
553    /// Global interner for language tags
554    pub static ref LANGUAGE_INTERNER: StringInterner = StringInterner::new();
555
556    /// Global interner for general strings (JSON-LD processing)
557    pub static ref STRING_INTERNER: StringInterner = StringInterner::new();
558}
559
/// Handle to an interned string.
///
/// Cloning copies the `Arc`, not the text. Equality, ordering and hashing are
/// all defined on the string content.
#[derive(Clone, Debug)]
pub struct InternedString {
    /// Shared string data obtained from an interner.
    inner: Arc<str>,
}
565
566impl InternedString {
567    /// Create a new interned string using the default IRI interner
568    pub fn new(s: &str) -> Self {
569        InternedString {
570            inner: IRI_INTERNER.intern(s),
571        }
572    }
573
574    /// Create a new interned string using a specific interner
575    pub fn new_with_interner(s: &str, interner: &StringInterner) -> Self {
576        InternedString {
577            inner: interner.intern(s),
578        }
579    }
580
581    /// Create an interned datatype string
582    pub fn new_datatype(s: &str) -> Self {
583        InternedString {
584            inner: DATATYPE_INTERNER.intern(s),
585        }
586    }
587
588    /// Create an interned language tag string
589    pub fn new_language(s: &str) -> Self {
590        InternedString {
591            inner: LANGUAGE_INTERNER.intern(s),
592        }
593    }
594
595    /// Get the string content
596    pub fn as_str(&self) -> &str {
597        &self.inner
598    }
599
600    /// Get the inner `Arc<str>` for zero-copy operations
601    pub fn as_arc_str(&self) -> &Arc<str> {
602        &self.inner
603    }
604
605    /// Convert into the inner `Arc<str>`
606    pub fn into_arc_str(self) -> Arc<str> {
607        self.inner
608    }
609}
610
611impl std::fmt::Display for InternedString {
612    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
613        write!(f, "{}", self.inner)
614    }
615}
616
617impl std::ops::Deref for InternedString {
618    type Target = str;
619
620    fn deref(&self) -> &Self::Target {
621        &self.inner
622    }
623}
624
625impl AsRef<str> for InternedString {
626    fn as_ref(&self) -> &str {
627        &self.inner
628    }
629}
630
631impl PartialEq for InternedString {
632    fn eq(&self, other: &Self) -> bool {
633        // Fast pointer comparison first
634        Arc::ptr_eq(&self.inner, &other.inner) || self.inner == other.inner
635    }
636}
637
638impl Eq for InternedString {}
639
640impl Hash for InternedString {
641    fn hash<H: Hasher>(&self, state: &mut H) {
642        // Hash the string content, not the pointer
643        self.inner.hash(state);
644    }
645}
646
647impl PartialOrd for InternedString {
648    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
649        Some(self.cmp(other))
650    }
651}
652
653impl Ord for InternedString {
654    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
655        self.inner.cmp(&other.inner)
656    }
657}
658
659impl From<&str> for InternedString {
660    fn from(s: &str) -> Self {
661        InternedString::new(s)
662    }
663}
664
665impl From<String> for InternedString {
666    fn from(s: String) -> Self {
667        InternedString::new(&s)
668    }
669}
670
671/// Extension trait for string interning common RDF vocabulary
672pub trait RdfVocabulary {
673    /// Common XSD namespace
674    const XSD_NS: &'static str = "http://www.w3.org/2001/XMLSchema#";
675    /// Common RDF namespace
676    const RDF_NS: &'static str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
677    /// Common RDFS namespace
678    const RDFS_NS: &'static str = "http://www.w3.org/2000/01/rdf-schema#";
679    /// Common OWL namespace  
680    const OWL_NS: &'static str = "http://www.w3.org/2002/07/owl#";
681
682    fn xsd_string() -> InternedString {
683        InternedString::new_datatype(&format!("{}string", Self::XSD_NS))
684    }
685
686    fn xsd_integer() -> InternedString {
687        InternedString::new_datatype(&format!("{}integer", Self::XSD_NS))
688    }
689
690    fn xsd_decimal() -> InternedString {
691        InternedString::new_datatype(&format!("{}decimal", Self::XSD_NS))
692    }
693
694    fn xsd_boolean() -> InternedString {
695        InternedString::new_datatype(&format!("{}boolean", Self::XSD_NS))
696    }
697
698    fn xsd_double() -> InternedString {
699        InternedString::new_datatype(&format!("{}double", Self::XSD_NS))
700    }
701
702    fn xsd_float() -> InternedString {
703        InternedString::new_datatype(&format!("{}float", Self::XSD_NS))
704    }
705
706    fn xsd_date_time() -> InternedString {
707        InternedString::new_datatype(&format!("{}dateTime", Self::XSD_NS))
708    }
709
710    fn rdf_type() -> InternedString {
711        InternedString::new(&format!("{}type", Self::RDF_NS))
712    }
713
714    fn rdfs_label() -> InternedString {
715        InternedString::new(&format!("{}label", Self::RDFS_NS))
716    }
717
718    fn rdfs_comment() -> InternedString {
719        InternedString::new(&format!("{}comment", Self::RDFS_NS))
720    }
721}
722
723/// Implement RdfVocabulary for InternedString to provide easy access to common terms
724impl RdfVocabulary for InternedString {}
725
726#[cfg(test)]
727mod tests {
728    use super::*;
729
730    #[test]
731    fn test_string_interner_basic() {
732        let interner = StringInterner::new();
733
734        let s1 = interner.intern("http://example.org/test");
735        let s2 = interner.intern("http://example.org/test");
736        let s3 = interner.intern("http://example.org/different");
737
738        // Same string should return same Arc (pointer equality)
739        assert!(Arc::ptr_eq(&s1, &s2));
740        assert!(!Arc::ptr_eq(&s1, &s3));
741
742        // Content should be equal
743        assert_eq!(s1.as_ref(), "http://example.org/test");
744        assert_eq!(s2.as_ref(), "http://example.org/test");
745        assert_eq!(s3.as_ref(), "http://example.org/different");
746    }
747
748    #[test]
749    fn test_string_interner_stats() {
750        let interner = StringInterner::new();
751
752        // First request - cache miss
753        let _s1 = interner.intern("test");
754        let stats = interner.stats();
755        assert_eq!(stats.total_requests, 1);
756        assert_eq!(stats.cache_misses, 1);
757        assert_eq!(stats.cache_hits, 0);
758
759        // Second request for same string - cache hit
760        let _s2 = interner.intern("test");
761        let stats = interner.stats();
762        assert_eq!(stats.total_requests, 2);
763        assert_eq!(stats.cache_misses, 1);
764        assert_eq!(stats.cache_hits, 1);
765        assert_eq!(stats.hit_ratio(), 0.5);
766    }
767
768    #[test]
769    fn test_string_interner_cleanup() {
770        let interner = StringInterner::new();
771
772        {
773            let _s1 = interner.intern("temporary");
774            assert_eq!(interner.len(), 1);
775        } // s1 goes out of scope
776
777        interner.cleanup();
778        assert_eq!(interner.len(), 0);
779    }
780
781    #[test]
782    fn test_interned_string_creation() {
783        let s1 = InternedString::new("http://example.org/test");
784        let s2 = InternedString::new("http://example.org/test");
785        let s3 = InternedString::new("http://example.org/different");
786
787        assert_eq!(s1, s2);
788        assert_ne!(s1, s3);
789        assert_eq!(s1.as_str(), "http://example.org/test");
790    }
791
792    #[test]
793    fn test_interned_string_ordering() {
794        let s1 = InternedString::new("apple");
795        let s2 = InternedString::new("banana");
796        let s3 = InternedString::new("apple");
797
798        assert!(s1 < s2);
799        assert!(s2 > s1);
800        assert_eq!(s1, s3);
801
802        // Test that ordering is consistent
803        let mut strings = vec![s2.clone(), s1.clone(), s3.clone()];
804        strings.sort();
805        assert_eq!(strings, vec![s1, s3, s2]);
806    }
807
808    #[test]
809    fn test_interned_string_hashing() {
810        use std::collections::HashMap;
811
812        let s1 = InternedString::new("test");
813        let s2 = InternedString::new("test");
814        let s3 = InternedString::new("different");
815
816        let mut map = HashMap::new();
817        map.insert(s1.clone(), "value1");
818        map.insert(s3.clone(), "value2");
819
820        // s2 should map to the same value as s1 since they're equal
821        assert_eq!(map.get(&s2), Some(&"value1"));
822        assert_eq!(map.get(&s3), Some(&"value2"));
823        assert_eq!(map.len(), 2);
824    }
825
826    #[test]
827    fn test_global_interners() {
828        let iri1 = InternedString::new("http://example.org/test");
829        let iri2 = InternedString::new("http://example.org/test");
830
831        let datatype1 = InternedString::new_datatype("http://www.w3.org/2001/XMLSchema#string");
832        let datatype2 = InternedString::new_datatype("http://www.w3.org/2001/XMLSchema#string");
833
834        let lang1 = InternedString::new_language("en");
835        let lang2 = InternedString::new_language("en");
836
837        // Verify that equal strings are interned
838        assert_eq!(iri1, iri2);
839        assert_eq!(datatype1, datatype2);
840        assert_eq!(lang1, lang2);
841    }
842
843    #[test]
844    fn test_rdf_vocabulary() {
845        let string_type = InternedString::xsd_string();
846        let integer_type = InternedString::xsd_integer();
847        let rdf_type = InternedString::rdf_type();
848
849        assert_eq!(
850            string_type.as_str(),
851            "http://www.w3.org/2001/XMLSchema#string"
852        );
853        assert_eq!(
854            integer_type.as_str(),
855            "http://www.w3.org/2001/XMLSchema#integer"
856        );
857        assert_eq!(
858            rdf_type.as_str(),
859            "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
860        );
861
862        // Test that repeated calls return interned strings
863        let string_type2 = InternedString::xsd_string();
864        assert_eq!(string_type, string_type2);
865    }
866
867    #[test]
868    fn test_interned_string_display() {
869        let s = InternedString::new("http://example.org/test");
870        assert_eq!(format!("{s}"), "http://example.org/test");
871    }
872
873    #[test]
874    fn test_interned_string_deref() {
875        let s = InternedString::new("test");
876        assert_eq!(&*s, "test");
877        assert_eq!(s.len(), 4);
878        assert!(s.starts_with("te"));
879    }
880
/// Converting from a borrowed `&str` and from an owned `String` must give
/// equal interned values carrying the same text.
#[test]
fn test_interned_string_conversions() {
    let from_slice: InternedString = "test".into();
    let from_owned: InternedString = String::from("test").into();

    assert_eq!(from_slice, from_owned);
    assert_eq!(from_slice.as_str(), "test");
}
889
/// Interns from ten threads at once, using three distinct strings.
///
/// Each thread interns its own string 100 times. Afterwards every pair of
/// equal strings *within one thread's results* must be the same allocation,
/// and the interner must report no more than three entries.
#[test]
fn test_concurrent_interning() {
    use std::sync::Arc;
    use std::thread;

    let interner = Arc::new(StringInterner::new());

    // Start every worker before joining any of them.
    let mut workers = Vec::with_capacity(10);
    for i in 0..10 {
        let shared = Arc::clone(&interner);
        workers.push(thread::spawn(move || {
            let iri = format!("http://example.org/test{}", i % 3);
            let mut interned = Vec::with_capacity(100);
            for _ in 0..100 {
                interned.push(shared.intern(&iri));
            }
            interned
        }));
    }

    // Gather all results first so every returned Arc stays alive during the checks.
    let mut results: Vec<Vec<Arc<str>>> = Vec::with_capacity(workers.len());
    for worker in workers {
        results.push(worker.join().unwrap());
    }

    // Verify that all equal strings are the same Arc (pairwise, per thread).
    for batch in &results {
        let mut remaining = batch.as_slice();
        while let Some((head, tail)) = remaining.split_first() {
            for other in tail {
                if head.as_ref() == other.as_ref() {
                    assert!(Arc::ptr_eq(head, other));
                }
            }
            remaining = tail;
        }
    }

    // Should have at most 3 unique strings (test0, test1, test2)
    let unique_entries = interner.len();
    assert!(unique_entries <= 3);
}
922
/// Exercises `intern_with_id` with the `get_id` / `get_string` lookups:
/// repeated strings share an ID and an allocation, distinct strings do not,
/// and lookups for unknown strings or IDs return `None`.
#[test]
fn test_term_id_mapping() {
    let interner = StringInterner::new();

    // Interning the same text twice yields the same ID and the same Arc.
    let (first_arc, first_id) = interner.intern_with_id("test_string");
    let (repeat_arc, repeat_id) = interner.intern_with_id("test_string");
    assert_eq!(first_id, repeat_id);
    assert!(Arc::ptr_eq(&first_arc, &repeat_arc));

    // A different text receives a different ID and a different Arc.
    let (other_arc, other_id) = interner.intern_with_id("different_string");
    assert_ne!(first_id, other_id);
    assert!(!Arc::ptr_eq(&first_arc, &other_arc));

    // String -> ID lookups.
    assert_eq!(interner.get_id("test_string"), Some(first_id));
    assert_eq!(interner.get_id("different_string"), Some(other_id));
    assert_eq!(interner.get_id("nonexistent"), None);

    // ID -> string lookups.
    let resolved_first = interner.get_string(first_id).unwrap();
    assert_eq!(resolved_first.as_ref(), "test_string");
    let resolved_other = interner.get_string(other_id).unwrap();
    assert_eq!(resolved_other.as_ref(), "different_string");
    assert_eq!(interner.get_string(999), None);
}
953
/// `id_mapping_count` starts at zero, grows by one for each new string, and
/// is unchanged when an already-known string is interned again.
#[test]
fn test_id_mapping_stats() {
    let interner = StringInterner::new();
    assert_eq!(interner.id_mapping_count(), 0);

    // (string to intern, mapping count expected right afterwards)
    let steps = [
        ("string1", 1),
        ("string2", 2),
        // Interning same string again shouldn't increase count
        ("string1", 2),
    ];
    for (text, expected_count) in steps {
        interner.intern_with_id(text);
        assert_eq!(interner.id_mapping_count(), expected_count);
    }
}
970
/// `get_all_mappings` must return exactly the interned (ID, string) pairs,
/// each string paired with the ID that `intern_with_id` handed out for it.
#[test]
fn test_get_all_mappings() {
    let interner = StringInterner::new();

    let (_, first_id) = interner.intern_with_id("first");
    let (_, second_id) = interner.intern_with_id("second");
    let (_, third_id) = interner.intern_with_id("third");

    let mappings = interner.get_all_mappings();
    assert_eq!(mappings.len(), 3);

    // Expected text -> ID table; `seen` tracks which rows were encountered.
    let expected = [
        ("first", first_id),
        ("second", second_id),
        ("third", third_id),
    ];
    let mut seen = [false; 3];

    for (id, string) in mappings {
        let text: &str = string.as_ref();
        let slot = expected
            .iter()
            .position(|(name, _)| *name == text)
            .unwrap_or_else(|| panic!("Unexpected string in mappings"));
        assert_eq!(id, expected[slot].1);
        seen[slot] = true;
    }

    // Every expected string must have appeared in the mappings.
    assert!(seen.iter().all(|&found| found));
}
1003
/// Plain `intern` and `intern_with_id` can be used on the same interner:
/// plain interning still deduplicates, the ID lookup still resolves, and
/// `len` reports at least the two distinct strings.
#[test]
fn test_mixed_interning_modes() {
    let interner = StringInterner::new();

    // Interleave the two interning entry points.
    let plain_first = interner.intern("regular");
    let (_kept_alive, tagged_id) = interner.intern_with_id("with_id");
    let plain_again = interner.intern("regular"); // same text as `plain_first`

    // Plain interning returns the same allocation for the repeated text.
    assert!(Arc::ptr_eq(&plain_first, &plain_again));

    // The ID issued by `intern_with_id` resolves back to its text.
    let resolved = interner.get_string(tagged_id).unwrap();
    assert_eq!(resolved.as_ref(), "with_id");

    // Length reporting covers at least the two distinct strings.
    let total = interner.len();
    assert!(total >= 2);
}
1022}