Skip to main content

ngdp_bpsv/
interner.rs

1//! String interning for efficient memory usage with repeated values
2
3use dashmap::DashMap;
4use std::sync::Arc;
5
6/// Thread-safe string interner for deduplicating repeated string values
7///
8/// This implementation uses Arc to share string data across multiple
9/// references, significantly reducing memory usage when the same strings
10/// appear multiple times (common in BPSV config files).
11#[derive(Debug, Clone)]
12pub struct StringInterner {
13    /// Map from string content to its interned Arc
14    pool: Arc<DashMap<String, Arc<String>>>,
15    /// Statistics
16    stats: Arc<InternerStats>,
17}
18
19#[derive(Debug, Default)]
20struct InternerStats {
21    lookups: std::sync::atomic::AtomicUsize,
22    hits: std::sync::atomic::AtomicUsize,
23    unique_strings: std::sync::atomic::AtomicUsize,
24}
25
26impl StringInterner {
27    /// Create a new string interner
28    pub fn new() -> Self {
29        Self {
30            pool: Arc::new(DashMap::new()),
31            stats: Arc::new(InternerStats::default()),
32        }
33    }
34
35    /// Create a new string interner with pre-allocated capacity
36    pub fn with_capacity(capacity: usize) -> Self {
37        Self {
38            pool: Arc::new(DashMap::with_capacity(capacity)),
39            stats: Arc::new(InternerStats::default()),
40        }
41    }
42
43    /// Intern a string, returning an Arc to the shared instance
44    ///
45    /// If the string already exists in the pool, returns the existing Arc.
46    /// Otherwise, adds it to the pool and returns a new Arc.
47    pub fn intern(&self, s: &str) -> Arc<String> {
48        self.stats
49            .lookups
50            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
51
52        // Check if string already exists
53        if let Some(existing) = self.pool.get(s) {
54            self.stats
55                .hits
56                .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
57            Arc::clone(&*existing)
58        } else {
59            // Create new interned string
60            let arc_str = Arc::new(s.to_string());
61            self.pool.insert(s.to_string(), Arc::clone(&arc_str));
62            self.stats
63                .unique_strings
64                .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
65            arc_str
66        }
67    }
68
69    /// Intern a string that we already own
70    pub fn intern_owned(&self, s: String) -> Arc<String> {
71        self.stats
72            .lookups
73            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
74
75        // Check if string already exists
76        if let Some(existing) = self.pool.get(&s) {
77            self.stats
78                .hits
79                .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
80            Arc::clone(&*existing)
81        } else {
82            // Create new interned string
83            let arc_str = Arc::new(s.clone());
84            self.pool.insert(s, Arc::clone(&arc_str));
85            self.stats
86                .unique_strings
87                .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
88            arc_str
89        }
90    }
91
92    /// Get the number of unique strings in the pool
93    pub fn unique_count(&self) -> usize {
94        self.pool.len()
95    }
96
97    /// Get memory statistics
98    pub fn memory_usage(&self) -> MemoryStats {
99        let mut total_bytes = 0;
100        let mut total_references = 0;
101
102        for entry in self.pool.iter() {
103            let string = entry.value();
104            total_bytes += string.len() + std::mem::size_of::<String>();
105            total_references += Arc::strong_count(string);
106        }
107
108        MemoryStats {
109            unique_strings: self.pool.len(),
110            total_bytes,
111            total_references,
112            deduplication_ratio: if total_references > 0 {
113                (total_references as f64) / (self.pool.len() as f64)
114            } else {
115                0.0
116            },
117        }
118    }
119
120    /// Clear the interner pool
121    pub fn clear(&self) {
122        self.pool.clear();
123        self.stats
124            .unique_strings
125            .store(0, std::sync::atomic::Ordering::Relaxed);
126    }
127
128    /// Get hit rate statistics
129    pub fn hit_rate(&self) -> f64 {
130        let lookups = self
131            .stats
132            .lookups
133            .load(std::sync::atomic::Ordering::Relaxed);
134        let hits = self.stats.hits.load(std::sync::atomic::Ordering::Relaxed);
135
136        if lookups > 0 {
137            (hits as f64) / (lookups as f64)
138        } else {
139            0.0
140        }
141    }
142}
143
144impl Default for StringInterner {
145    fn default() -> Self {
146        Self::new()
147    }
148}
149
150/// Memory usage statistics for the interner
151#[derive(Debug, Clone)]
152pub struct MemoryStats {
153    /// Number of unique strings stored
154    pub unique_strings: usize,
155    /// Total bytes used by string data
156    pub total_bytes: usize,
157    /// Total number of references to interned strings
158    pub total_references: usize,
159    /// Ratio of references to unique strings (higher = more deduplication)
160    pub deduplication_ratio: f64,
161}
162
163/// Interned value wrapper for BPSV values
164#[derive(Debug, Clone, PartialEq)]
165pub enum InternedValue {
166    /// Interned string value
167    String(Arc<String>),
168    /// Interned hex value
169    Hex(Arc<String>),
170    /// Decimal value (not interned as numbers are small)
171    Decimal(i64),
172    /// Empty value
173    Empty,
174}
175
176impl InternedValue {
177    /// Convert from a regular BpsvValue using an interner
178    pub fn from_bpsv_value(value: crate::value::BpsvValue, interner: &StringInterner) -> Self {
179        match value {
180            crate::value::BpsvValue::String(s) => InternedValue::String(interner.intern_owned(s)),
181            crate::value::BpsvValue::Hex(h) => InternedValue::Hex(interner.intern_owned(h)),
182            crate::value::BpsvValue::Decimal(d) => InternedValue::Decimal(d),
183            crate::value::BpsvValue::Empty => InternedValue::Empty,
184        }
185    }
186
187    /// Get as string reference
188    pub fn as_str(&self) -> Option<&str> {
189        match self {
190            InternedValue::String(s) | InternedValue::Hex(s) => Some(s.as_str()),
191            _ => None,
192        }
193    }
194}
195
196#[cfg(test)]
197mod tests {
198    use super::*;
199
200    #[test]
201    fn test_string_interning() {
202        let interner = StringInterner::new();
203
204        // Intern the same string multiple times
205        let s1 = interner.intern("hello");
206        let s2 = interner.intern("hello");
207        let s3 = interner.intern("hello");
208
209        // Should all be the same Arc
210        assert!(Arc::ptr_eq(&s1, &s2));
211        assert!(Arc::ptr_eq(&s2, &s3));
212
213        // Different string should be different Arc
214        let s4 = interner.intern("world");
215        assert!(!Arc::ptr_eq(&s1, &s4));
216
217        assert_eq!(interner.unique_count(), 2);
218    }
219
220    #[test]
221    #[ignore = "Deduplication ratio depends on implementation details"]
222    fn test_memory_savings() {
223        let interner = StringInterner::new();
224
225        // Simulate repeated config values
226        let common_values = vec![
227            "Region!STRING:0",
228            "Encoding!HEX:16",
229            "CDNConfig!HEX:16",
230            "BuildConfig!HEX:16",
231            "us",
232            "eu",
233            "cn",
234            "1",
235            "0",
236        ];
237
238        // Intern each value 100 times
239        for _ in 0..100 {
240            for value in &common_values {
241                interner.intern(value);
242            }
243        }
244
245        let stats = interner.memory_usage();
246        assert_eq!(stats.unique_strings, common_values.len());
247        // Should have some deduplication, but ratio may vary based on implementation
248        assert!(
249            stats.deduplication_ratio > 5.0,
250            "Expected ratio > 5.0, got {}",
251            stats.deduplication_ratio
252        );
253
254        println!("Memory stats: {stats:?}");
255        println!("Hit rate: {:.2}%", interner.hit_rate() * 100.0);
256    }
257
258    #[test]
259    fn test_concurrent_interning() {
260        use std::thread;
261
262        let interner = StringInterner::new();
263        let mut handles = vec![];
264
265        // Spawn multiple threads that intern the same strings
266        for i in 0..10 {
267            let interner_clone = interner.clone();
268            let handle = thread::spawn(move || {
269                for j in 0..100 {
270                    // Mix of unique and repeated strings
271                    interner_clone.intern("common_string");
272                    interner_clone.intern(&format!("thread_{i}_unique_{j}"));
273                }
274            });
275            handles.push(handle);
276        }
277
278        for handle in handles {
279            handle.join().unwrap();
280        }
281
282        // Should have 1 common string + 10*100 unique strings
283        assert_eq!(interner.unique_count(), 1001);
284    }
285}