formualizer_eval/engine/arena/
string_interner.rs

1/// String interning for deduplication of text values and identifiers
2/// Uses FxHashMap for fast lookups and Box<str> to minimize allocations
3use rustc_hash::FxHashMap;
4use std::{fmt, sync::Arc};
5
6/// Reference to an interned string
7#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash, Ord, PartialOrd)]
8pub struct StringId(u32);
9
10impl StringId {
11    /// Maximum number of strings that can be interned (2^32 - 1)
12    pub const MAX: u32 = u32::MAX - 1;
13
14    /// Invalid/null string ID
15    pub const INVALID: StringId = StringId(u32::MAX);
16
17    pub fn as_u32(self) -> u32 {
18        self.0
19    }
20
21    pub fn from_raw(raw: u32) -> Self {
22        StringId(raw)
23    }
24
25    pub fn is_valid(self) -> bool {
26        self.0 != u32::MAX
27    }
28}
29
30impl fmt::Display for StringId {
31    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
32        write!(f, "StringId({})", self.0)
33    }
34}
35
36/// String interner for deduplicating strings
37#[derive(Debug)]
38pub struct StringInterner {
39    /// Storage for interned strings
40    strings: Vec<Arc<str>>,
41    /// Map from string content to ID for deduplication
42    lookup: FxHashMap<Arc<str>, StringId>,
43}
44
45impl StringInterner {
46    pub fn new() -> Self {
47        Self {
48            strings: Vec::new(),
49            lookup: FxHashMap::default(),
50        }
51    }
52
53    pub fn with_capacity(cap: usize) -> Self {
54        Self {
55            strings: Vec::with_capacity(cap),
56            lookup: FxHashMap::with_capacity_and_hasher(cap, Default::default()),
57        }
58    }
59
60    /// Intern a string, returning its ID
61    /// If the string is already interned, returns the existing ID
62    pub fn intern(&mut self, s: &str) -> StringId {
63        // Check if already interned
64        if let Some(&id) = self.lookup.get(s) {
65            return id;
66        }
67
68        // Check for overflow
69        let index = self.strings.len() as u32;
70        if index > StringId::MAX {
71            panic!("String interner overflow: too many strings");
72        }
73
74        // Add new string
75        let id = StringId(index);
76        let boxed: Arc<str> = s.into();
77
78        // We need to clone the boxed string for the lookup key
79        // This is safe because Box<str> is cheap to clone (just pointer copy)
80        self.lookup.insert(boxed.clone(), id);
81        self.strings.push(boxed);
82
83        id
84    }
85
86    /// Get a string by its ID
87    #[inline]
88    pub fn get(&self, id: StringId) -> Option<&str> {
89        if id.is_valid() {
90            self.strings.get(id.0 as usize).map(|s| &**s)
91        } else {
92            None
93        }
94    }
95
96    /// Get a string by its ID, panicking if invalid
97    #[inline]
98    pub fn resolve(&self, id: StringId) -> &str {
99        self.get(id)
100            .unwrap_or_else(|| panic!("Invalid string ID: {id:?}"))
101    }
102
103    /// Check if a string is already interned
104    pub fn contains(&self, s: &str) -> bool {
105        self.lookup.contains_key(s)
106    }
107
108    /// Get the ID of an already-interned string without interning
109    pub fn get_id(&self, s: &str) -> Option<StringId> {
110        self.lookup.get(s).copied()
111    }
112
113    /// Returns the number of interned strings
114    pub fn len(&self) -> usize {
115        self.strings.len()
116    }
117
118    /// Returns true if no strings are interned
119    pub fn is_empty(&self) -> bool {
120        self.strings.is_empty()
121    }
122
123    /// Returns memory usage in bytes (approximate)
124    pub fn memory_usage(&self) -> usize {
125        // Vector overhead
126        let vec_overhead = self.strings.capacity() * std::mem::size_of::<Box<str>>();
127
128        // String data
129        let string_data: usize = self
130            .strings
131            .iter()
132            .map(|s| s.len() + std::mem::size_of::<Box<str>>())
133            .sum();
134
135        // HashMap overhead (approximate)
136        let map_overhead = self.lookup.capacity()
137            * (std::mem::size_of::<Box<str>>() + std::mem::size_of::<StringId>());
138
139        vec_overhead + string_data + map_overhead
140    }
141
142    /// Clear all interned strings
143    pub fn clear(&mut self) {
144        self.strings.clear();
145        self.lookup.clear();
146    }
147
148    /// Iterate over all interned strings with their IDs
149    pub fn iter(&self) -> impl Iterator<Item = (StringId, &str)> + '_ {
150        self.strings
151            .iter()
152            .enumerate()
153            .map(|(i, s)| (StringId(i as u32), &**s))
154    }
155
156    /// Get statistics about the interner
157    pub fn stats(&self) -> InternerStats {
158        let total_bytes: usize = self.strings.iter().map(|s| s.len()).sum();
159        let unique_count = self.strings.len();
160
161        InternerStats {
162            unique_count,
163            total_bytes,
164            average_length: if unique_count > 0 {
165                total_bytes as f64 / unique_count as f64
166            } else {
167                0.0
168            },
169        }
170    }
171}
172
173impl Default for StringInterner {
174    fn default() -> Self {
175        Self::new()
176    }
177}
178
179/// Statistics about the string interner
180#[derive(Debug, Clone)]
181pub struct InternerStats {
182    pub unique_count: usize,
183    pub total_bytes: usize,
184    pub average_length: f64,
185}
186
187#[cfg(test)]
188mod tests {
189    use super::*;
190
191    #[test]
192    fn test_string_interning() {
193        let mut interner = StringInterner::new();
194
195        let s1 = interner.intern("Hello");
196        let s2 = interner.intern("Hello");
197        let s3 = interner.intern("World");
198
199        assert_eq!(s1, s2);
200        assert_ne!(s1, s3);
201        assert_eq!(interner.get(s1), Some("Hello"));
202        assert_eq!(interner.get(s3), Some("World"));
203    }
204
205    #[test]
206    fn test_string_memory_efficiency() {
207        let mut interner = StringInterner::new();
208
209        // Intern same string 1000 times
210        let refs: Vec<_> = (0..1000).map(|_| interner.intern("repeated")).collect();
211
212        // Should only store string once
213        assert_eq!(interner.len(), 1);
214
215        // All refs should be the same
216        for r in &refs[1..] {
217            assert_eq!(*r, refs[0]);
218        }
219
220        // Memory usage should be much less than 9000 bytes
221        assert!(interner.memory_usage() < 1000);
222    }
223
224    #[test]
225    fn test_string_interner_capacity() {
226        let mut interner = StringInterner::with_capacity(100);
227
228        // Intern many different strings
229        for i in 0..1000 {
230            let s = format!("string_{i}");
231            let id = interner.intern(&s);
232            assert_eq!(interner.get(id), Some(s.as_str()));
233        }
234
235        assert_eq!(interner.len(), 1000);
236    }
237
238    #[test]
239    fn test_string_interner_contains() {
240        let mut interner = StringInterner::new();
241
242        interner.intern("exists");
243
244        assert!(interner.contains("exists"));
245        assert!(!interner.contains("not_exists"));
246    }
247
248    #[test]
249    fn test_string_interner_get_id() {
250        let mut interner = StringInterner::new();
251
252        let id = interner.intern("test");
253
254        assert_eq!(interner.get_id("test"), Some(id));
255        assert_eq!(interner.get_id("not_interned"), None);
256    }
257
258    #[test]
259    fn test_string_interner_clear() {
260        let mut interner = StringInterner::new();
261
262        interner.intern("one");
263        interner.intern("two");
264        interner.intern("three");
265
266        assert_eq!(interner.len(), 3);
267
268        interner.clear();
269
270        assert_eq!(interner.len(), 0);
271        assert!(interner.is_empty());
272    }
273
274    #[test]
275    fn test_string_interner_iter() {
276        let mut interner = StringInterner::new();
277
278        let id1 = interner.intern("first");
279        let id2 = interner.intern("second");
280        let id3 = interner.intern("third");
281
282        let items: Vec<_> = interner.iter().collect();
283
284        assert_eq!(items.len(), 3);
285        assert_eq!(items[0], (id1, "first"));
286        assert_eq!(items[1], (id2, "second"));
287        assert_eq!(items[2], (id3, "third"));
288    }
289
290    #[test]
291    fn test_string_interner_stats() {
292        let mut interner = StringInterner::new();
293
294        interner.intern("short");
295        interner.intern("medium_length");
296        interner.intern("this_is_a_longer_string");
297
298        let stats = interner.stats();
299
300        assert_eq!(stats.unique_count, 3);
301        assert_eq!(stats.total_bytes, 5 + 13 + 23);
302        assert!((stats.average_length - 13.67).abs() < 0.01);
303    }
304
305    #[test]
306    fn test_invalid_string_id() {
307        let interner = StringInterner::new();
308
309        assert!(StringId::INVALID.0 == u32::MAX);
310        assert!(!StringId::INVALID.is_valid());
311        assert_eq!(interner.get(StringId::INVALID), None);
312    }
313
314    #[test]
315    #[should_panic(expected = "Invalid string ID")]
316    fn test_resolve_invalid_id() {
317        let interner = StringInterner::new();
318        interner.resolve(StringId::INVALID);
319    }
320
321    #[test]
322    fn test_string_id_ordering() {
323        let mut interner = StringInterner::new();
324
325        let id1 = interner.intern("a");
326        let id2 = interner.intern("b");
327        let id3 = interner.intern("c");
328
329        assert!(id1 < id2);
330        assert!(id2 < id3);
331        assert!(id1 < id3);
332    }
333
334    #[test]
335    fn test_empty_string() {
336        let mut interner = StringInterner::new();
337
338        let id = interner.intern("");
339        assert_eq!(interner.get(id), Some(""));
340
341        // Empty string should also be deduplicated
342        let id2 = interner.intern("");
343        assert_eq!(id, id2);
344    }
345
346    #[test]
347    fn test_unicode_strings() {
348        let mut interner = StringInterner::new();
349
350        let id1 = interner.intern("Hello δΈ–η•Œ");
351        let id2 = interner.intern("πŸ¦€ Rust");
352        let id3 = interner.intern("Hello δΈ–η•Œ");
353
354        assert_eq!(id1, id3);
355        assert_ne!(id1, id2);
356
357        assert_eq!(interner.get(id1), Some("Hello δΈ–η•Œ"));
358        assert_eq!(interner.get(id2), Some("πŸ¦€ Rust"));
359    }
360}