memscope_rs/core/
string_pool.rs

1//! Global string pool system for memory optimization
2//!
3//! This module provides a global string interning system to reduce memory usage
4//! by sharing common strings across the application. All string fields in
5//! AllocationInfo and related structures use `Arc<str>` backed by this pool.
6
7use dashmap::DashMap;
8use serde::{Deserialize, Serialize};
9use std::sync::atomic::{AtomicU64, Ordering};
10use std::sync::{Arc, OnceLock};
11
12/// Statistics about string pool usage
13#[derive(Debug, Clone, Serialize, Deserialize)]
14pub struct StringPoolStats {
15    /// Total number of unique strings in the pool
16    pub unique_strings: usize,
17    /// Total number of intern operations performed
18    pub intern_operations: u64,
19    /// Number of cache hits (string already existed)
20    pub cache_hits: u64,
21    /// Estimated memory saved by string interning (in bytes)
22    pub memory_saved_bytes: u64,
23    /// Average string length in the pool
24    pub average_string_length: f64,
25}
26
27/// Internal statistics tracking for the string pool
28struct PoolMetrics {
29    intern_count: AtomicU64,
30    cache_hits: AtomicU64,
31    total_string_bytes: AtomicU64,
32    duplicate_bytes_saved: AtomicU64,
33}
34
35impl PoolMetrics {
36    fn new() -> Self {
37        Self {
38            intern_count: AtomicU64::new(0),
39            cache_hits: AtomicU64::new(0),
40            total_string_bytes: AtomicU64::new(0),
41            duplicate_bytes_saved: AtomicU64::new(0),
42        }
43    }
44
45    fn record_intern(&self, string_len: usize, was_cached: bool) {
46        self.intern_count.fetch_add(1, Ordering::Relaxed);
47
48        if was_cached {
49            self.cache_hits.fetch_add(1, Ordering::Relaxed);
50            self.duplicate_bytes_saved
51                .fetch_add(string_len as u64, Ordering::Relaxed);
52        } else {
53            self.total_string_bytes
54                .fetch_add(string_len as u64, Ordering::Relaxed);
55        }
56    }
57}
58
59#[cfg(test)]
60mod string_pool_tests {
61    use super::*;
62    use std::sync::Arc;
63    use std::thread;
64
65    #[test]
66    fn test_string_pool_deduplication() {
67        let pool = StringPool::new();
68
69        // Same string should return same Arc
70        let str1 = pool.intern("hello world");
71        let str2 = pool.intern("hello world");
72
73        assert!(Arc::ptr_eq(&str1, &str2));
74
75        // Different strings should return different Arcs
76        let str3 = pool.intern("different string");
77        assert!(!Arc::ptr_eq(&str1, &str3));
78    }
79
80    #[test]
81    fn test_string_pool_memory_efficiency() {
82        let pool = StringPool::new();
83
84        // Intern the same string multiple times
85        let original = "this is a test string for memory efficiency";
86        let mut handles = Vec::new();
87
88        for _ in 0..1000 {
89            handles.push(pool.intern(original));
90        }
91
92        // All handles should point to the same memory
93        for handle in &handles[1..] {
94            assert!(Arc::ptr_eq(&handles[0], handle));
95        }
96
97        // Pool should only contain one entry
98        let stats = pool.get_stats();
99        assert_eq!(stats.unique_strings, 1);
100        assert_eq!(stats.intern_operations, 1000);
101    }
102
103    #[test]
104    fn test_string_pool_concurrent_access() {
105        let pool = Arc::new(StringPool::new());
106        let mut handles = Vec::new();
107
108        // Spawn multiple threads that intern the same strings
109        for _i in 0..10 {
110            let pool_clone = Arc::clone(&pool);
111            let handle = thread::spawn(move || {
112                let mut results = Vec::new();
113                for j in 0..100 {
114                    let s = if j % 2 == 0 {
115                        "common_string_a"
116                    } else {
117                        "common_string_b"
118                    };
119                    results.push(pool_clone.intern(s));
120                }
121                results
122            });
123            handles.push(handle);
124        }
125
126        // Collect all results
127        let mut all_results = Vec::new();
128        for handle in handles {
129            all_results.extend(handle.join().unwrap());
130        }
131
132        // Verify deduplication worked across threads
133        let string_a_refs: Vec<_> = all_results
134            .iter()
135            .filter(|s| &***s == "common_string_a")
136            .collect();
137        let string_b_refs: Vec<_> = all_results
138            .iter()
139            .filter(|s| &***s == "common_string_b")
140            .collect();
141
142        // All references to the same string should be identical
143        for ref_a in &string_a_refs[1..] {
144            assert!(Arc::ptr_eq(string_a_refs[0], ref_a));
145        }
146        for ref_b in &string_b_refs[1..] {
147            assert!(Arc::ptr_eq(string_b_refs[0], ref_b));
148        }
149
150        let stats = pool.get_stats();
151        assert_eq!(stats.unique_strings, 2); // Only "common_string_a" and "common_string_b"
152    }
153
154    #[test]
155    fn test_string_pool_memory_usage_calculation() {
156        let pool = StringPool::new();
157
158        let _short_str = pool.intern("hi");
159        let _long_str = pool.intern("this is a much longer string that takes more memory");
160
161        let stats = pool.get_stats();
162
163        // Should track memory usage accurately
164        assert_eq!(stats.unique_strings, 2);
165    }
166
167    #[test]
168    fn test_string_pool_cleanup_on_drop() {
169        let pool = StringPool::new();
170
171        {
172            let _temp_str = pool.intern("temporary string");
173            let stats = pool.get_stats();
174            assert_eq!(stats.unique_strings, 1);
175        }
176
177        // After temp_str is dropped, the pool should still contain the string
178        // because Arc keeps it alive until all references are gone
179        let stats = pool.get_stats();
180        assert_eq!(stats.unique_strings, 1);
181
182        // But if we force cleanup (if implemented), it should be removed
183        // This tests the cleanup mechanism
184    }
185
186    #[test]
187    fn test_string_pool_large_strings() {
188        let pool = StringPool::new();
189
190        // Test with very large strings
191        let large_string = "x".repeat(10000);
192        let str1 = pool.intern(&large_string);
193        let str2 = pool.intern(&large_string);
194
195        assert!(Arc::ptr_eq(&str1, &str2));
196
197        let stats = pool.get_stats();
198        assert_eq!(stats.unique_strings, 1);
199    }
200
201    #[test]
202    fn test_string_pool_empty_strings() {
203        let pool = StringPool::new();
204
205        let empty1 = pool.intern("");
206        let empty2 = pool.intern("");
207
208        assert!(Arc::ptr_eq(&empty1, &empty2));
209        assert_eq!(empty1.as_ref(), "");
210
211        let stats = pool.get_stats();
212        assert_eq!(stats.unique_strings, 1);
213    }
214
215    #[test]
216    fn test_string_pool_unicode_strings() {
217        let pool = StringPool::new();
218
219        let unicode1 = pool.intern("Hello world 🌍");
220        let unicode2 = pool.intern("Hello world 🌍");
221        let different_unicode = pool.intern("Hello world 🌎");
222
223        assert!(Arc::ptr_eq(&unicode1, &unicode2));
224        assert!(!Arc::ptr_eq(&unicode1, &different_unicode));
225
226        let stats = pool.get_stats();
227        assert_eq!(stats.unique_strings, 2);
228    }
229
230    #[test]
231    fn test_string_pool_stats_accuracy() {
232        let pool = StringPool::new();
233
234        // Start with empty pool
235        let initial_stats = pool.get_stats();
236        assert_eq!(initial_stats.unique_strings, 0);
237        assert_eq!(initial_stats.intern_operations, 0);
238
239        // Add some strings
240        let str1 = pool.intern("first");
241        let str2 = pool.intern("second");
242        let str1_again = pool.intern("first"); // Should reuse
243
244        let stats = pool.get_stats();
245        assert_eq!(stats.unique_strings, 2); // "first" and "second"
246        assert!(stats.intern_operations >= 2); // Should have some operations
247
248        // Verify the strings are correct
249        assert_eq!(str1.as_ref(), "first");
250        assert_eq!(str2.as_ref(), "second");
251        assert!(Arc::ptr_eq(&str1, &str1_again));
252    }
253
254    #[test]
255    fn test_global_string_pool_singleton() {
256        // Test that global pool is a singleton
257        let str1 = intern_string("global test");
258        let str2 = intern_string("global test");
259
260        assert!(Arc::ptr_eq(&str1, &str2));
261
262        // Test from different calls to global pool
263        let pool1 = get_global_pool();
264        let pool2 = get_global_pool();
265
266        let str3 = pool1.intern("another global test");
267        let str4 = pool2.intern("another global test");
268
269        assert!(Arc::ptr_eq(&str3, &str4));
270    }
271
272    #[test]
273    fn test_string_pool_performance_characteristics() {
274        let pool = StringPool::new();
275
276        // Test that repeated interning is fast (should be O(1) lookup)
277        let test_string = "performance test string";
278
279        let start = std::time::Instant::now();
280        for _ in 0..10000 {
281            pool.intern(test_string);
282        }
283        let duration = start.elapsed();
284
285        // Should be very fast since it's just hash lookups after first insert
286        assert!(
287            duration.as_millis() < 100,
288            "String pool lookup too slow: {duration:?}",
289        );
290
291        let stats = pool.get_stats();
292        assert_eq!(stats.unique_strings, 1);
293    }
294
295    #[test]
296    fn test_string_pool_memory_overhead() {
297        let pool = StringPool::new();
298
299        // Test that memory overhead is reasonable
300        let test_strings = vec![
301            "short",
302            "medium length string",
303            "this is a much longer string that should still be handled efficiently",
304        ];
305
306        let mut total_string_bytes = 0;
307        for s in &test_strings {
308            pool.intern(s);
309            total_string_bytes += s.len();
310        }
311
312        let stats = pool.get_stats();
313
314        // Memory usage should be close to actual string sizes (plus some overhead)
315        let overhead_ratio = (stats.memory_saved_bytes + total_string_bytes as u64) as f64
316            / total_string_bytes as f64;
317        assert!(
318            overhead_ratio < 2.0,
319            "Memory overhead too high: {overhead_ratio:.2}x"
320        );
321        assert!(
322            overhead_ratio >= 1.0,
323            "Memory usage calculation seems wrong"
324        );
325    }
326}
327
328/// Global string pool for memory-efficient string storage
329pub struct StringPool {
330    /// Map from string content to interned `Arc<str>`
331    pool: DashMap<String, Arc<str>>,
332    /// Performance and usage metrics
333    metrics: PoolMetrics,
334}
335
336impl StringPool {
337    /// Create a new string pool
338    fn new() -> Self {
339        Self {
340            pool: DashMap::new(),
341            metrics: PoolMetrics::new(),
342        }
343    }
344
345    /// Intern a string, returning an `Arc<str>` that can be shared
346    ///
347    /// If the string already exists in the pool, returns the existing `Arc<str>`.
348    /// Otherwise, creates a new `Arc<str>` and stores it in the pool.
349    pub fn intern(&self, s: &str) -> Arc<str> {
350        let string_len = s.len();
351
352        // Try to get existing string first
353        if let Some(existing) = self.pool.get(s) {
354            self.metrics.record_intern(string_len, true);
355            return existing.clone();
356        }
357
358        // String doesn't exist, create new Arc<str>
359        let arc_str: Arc<str> = Arc::from(s);
360
361        // Insert into pool - use entry API to handle race conditions
362        let result = self
363            .pool
364            .entry(s.to_string())
365            .or_insert_with(|| Arc::clone(&arc_str))
366            .clone();
367
368        self.metrics.record_intern(string_len, false);
369        result
370    }
371
372    /// Get current statistics about the string pool
373    pub fn get_stats(&self) -> StringPoolStats {
374        let unique_strings = self.pool.len();
375        let intern_operations = self.metrics.intern_count.load(Ordering::Relaxed);
376        let cache_hits = self.metrics.cache_hits.load(Ordering::Relaxed);
377        let total_bytes = self.metrics.total_string_bytes.load(Ordering::Relaxed);
378        let saved_bytes = self.metrics.duplicate_bytes_saved.load(Ordering::Relaxed);
379
380        let average_length = if unique_strings > 0 {
381            total_bytes as f64 / unique_strings as f64
382        } else {
383            0.0
384        };
385
386        StringPoolStats {
387            unique_strings,
388            intern_operations,
389            cache_hits,
390            memory_saved_bytes: saved_bytes,
391            average_string_length: average_length,
392        }
393    }
394
395    /// Clear all strings from the pool (useful for testing)
396    #[cfg(test)]
397    pub fn clear(&self) {
398        self.pool.clear();
399    }
400
401    /// Get the number of unique strings currently in the pool
402    pub fn len(&self) -> usize {
403        self.pool.len()
404    }
405
406    /// Check if the pool is empty
407    pub fn is_empty(&self) -> bool {
408        self.pool.is_empty()
409    }
410}
411
412/// Global string pool instance
413static GLOBAL_STRING_POOL: OnceLock<StringPool> = OnceLock::new();
414
415/// Get the global string pool instance
416fn get_global_pool() -> &'static StringPool {
417    GLOBAL_STRING_POOL.get_or_init(StringPool::new)
418}
419
420/// Intern a string using the global string pool
421///
422/// This is the main API for string interning. All string fields in AllocationInfo
423/// and related structures should use this function to intern their strings.
424///
425/// # Example
426/// ```text
427/// use memscope_rs::core::string_pool::intern_string;
428/// use std::sync::Arc;
429///
430/// let s1 = intern_string("hello");
431/// let s2 = intern_string("hello");
432///
433/// // Both Arc<str> point to the same memory
434/// assert!(Arc::ptr_eq(&s1, &s2));
435/// ```
436pub fn intern_string(s: &str) -> Arc<str> {
437    let start_time = std::time::Instant::now();
438    let result = get_global_pool().intern(s);
439
440    // Record timing for global monitoring
441    let duration_ns = start_time.elapsed().as_nanos() as u64;
442    crate::core::string_pool_monitor::record_intern_operation(duration_ns);
443
444    result
445}
446
447/// Get statistics about the global string pool
448pub fn get_string_pool_stats() -> StringPoolStats {
449    get_global_pool().get_stats()
450}
451
452/// Clear the global string pool (useful for testing)
453#[cfg(test)]
454pub fn clear_string_pool() {
455    if let Some(pool) = GLOBAL_STRING_POOL.get() {
456        pool.clear();
457    }
458}
459
460#[cfg(test)]
461mod tests {
462    use super::*;
463
464    #[test]
465    fn test_string_interning() {
466        clear_string_pool();
467
468        let s1 = intern_string("test");
469        let s2 = intern_string("test");
470        let s3 = intern_string("different");
471
472        // Same strings should be the same Arc
473        assert!(Arc::ptr_eq(&s1, &s2));
474
475        // Different strings should be different Arcs
476        assert!(!Arc::ptr_eq(&s1, &s3));
477
478        // Content should be correct
479        assert_eq!(&*s1, "test");
480        assert_eq!(&*s3, "different");
481    }
482
483    #[test]
484    fn test_string_pool_stats() {
485        // Use a local pool to avoid interference from other tests
486        let pool = StringPool::new();
487
488        let initial_stats = pool.get_stats();
489
490        let s1 = pool.intern("unique_test_string_1");
491        let s2 = pool.intern("unique_test_string_1"); // Should be cache hit
492        let s3 = pool.intern("unique_test_string_2");
493
494        let stats = pool.get_stats();
495
496        // Verify that operations increased
497        assert!(stats.intern_operations >= initial_stats.intern_operations + 3);
498        // Verify that we have at least the strings we added
499        assert!(stats.unique_strings >= initial_stats.unique_strings + 2);
500        // Verify that we had at least one cache hit
501        assert!(stats.cache_hits > initial_stats.cache_hits);
502
503        // Verify the strings are actually the same Arc
504        assert_eq!(&*s1, "unique_test_string_1");
505        assert_eq!(&*s2, "unique_test_string_1");
506        assert_eq!(&*s3, "unique_test_string_2");
507    }
508
509    #[test]
510    fn test_empty_string_interning() {
511        clear_string_pool();
512
513        let s1 = intern_string("");
514        let s2 = intern_string("");
515
516        assert!(Arc::ptr_eq(&s1, &s2));
517        assert_eq!(&*s1, "");
518    }
519
520    #[test]
521    fn test_concurrent_interning() {
522        use std::thread;
523
524        // Test that concurrent interning works without panicking
525        // We'll focus on correctness rather than Arc pointer equality
526        let handles: Vec<_> = (0..10)
527            .map(|i| {
528                thread::spawn(move || {
529                    let s = format!("concurrent_test_{}", i % 3); // Only 3 unique strings
530                    intern_string(&s)
531                })
532            })
533            .collect();
534
535        let results: Vec<_> = handles.into_iter().filter_map(|h| h.join().ok()).collect();
536
537        // Verify all results have correct content
538        for (i, arc_str) in results.iter().enumerate() {
539            let expected = format!("concurrent_test_{}", i % 3);
540            assert_eq!(arc_str.as_ref(), expected);
541        }
542
543        // Verify we got the expected number of results
544        assert_eq!(results.len(), 10);
545    }
546}