Skip to main content

ralph_workflow/checkpoint/
string_pool.rs

//! String interning for deduplicating repeated strings in execution history.
//!
//! This module provides a string pool that deduplicates commonly repeated strings
//! (like phase names and agent names) by storing them as `Arc<str>`. This reduces
//! memory usage when the same strings appear many times across execution history.
6
7use std::collections::HashSet;
8use std::sync::Arc;
9
/// String pool for deduplicating commonly repeated strings in execution history.
///
/// Phase names and agent names are repeated frequently across execution history
/// entries. Handing out `Arc<str>` values from a shared pool means identical
/// strings share one heap allocation instead of being stored once per entry.
///
/// Cloning a pool clones the `Arc` handles, not the string data, so the clone
/// initially shares every allocation with the original; entries interned
/// afterwards are tracked independently by each pool.
///
/// `StringPool::default()` starts with an empty set and no pre-allocated
/// capacity; use [`StringPool::new`] or [`StringPool::with_capacity`] to
/// reserve space up front.
///
/// # Example
///
/// ```
/// use ralph_workflow::checkpoint::string_pool::StringPool;
/// use std::sync::Arc;
///
/// let mut pool = StringPool::new();
/// let phase1 = pool.intern("Development");
/// let phase2 = pool.intern("Development");
///
/// // Both `Arc<str>` values point to the same allocation
/// assert!(Arc::ptr_eq(&phase1, &phase2));
/// ```
#[derive(Debug, Clone, Default)]
pub struct StringPool {
    // One `Arc<str>` per unique string. `Arc<str>` implements `Borrow<str>`,
    // so the set can be probed with a plain `&str` without allocating.
    pool: HashSet<Arc<str>>,
}

impl StringPool {
    /// Create a new string pool with a default capacity hint.
    ///
    /// Pre-allocates room for at least 16 unique strings, which is intended to
    /// cover a typical pipeline run (phase names, agent names, step types).
    #[must_use]
    pub fn new() -> Self {
        Self::with_capacity(16)
    }

    /// Create a string pool with a specific capacity.
    ///
    /// Use this when the expected number of unique strings is known, to avoid
    /// hash table resizing during initial population.
    #[must_use]
    pub fn with_capacity(capacity: usize) -> Self {
        Self {
            pool: HashSet::with_capacity(capacity),
        }
    }

    /// Get or insert a string slice into the pool, returning an `Arc<str>`.
    ///
    /// If an equal string is already pooled, a clone of the existing handle is
    /// returned and nothing is allocated. Otherwise the slice is copied into a
    /// new `Arc<str>`, which is stored in the pool and returned.
    ///
    /// Prefer this when the input is already a `&str` to avoid allocating a
    /// temporary `String` on repeated calls.
    ///
    /// # Example
    ///
    /// ```
    /// use ralph_workflow::checkpoint::string_pool::StringPool;
    ///
    /// let mut pool = StringPool::new();
    /// let s1 = pool.intern_str("test");
    /// let s2 = pool.intern_str("test");
    /// assert!(std::sync::Arc::ptr_eq(&s1, &s2));
    /// ```
    pub fn intern_str(&mut self, s: &str) -> Arc<str> {
        // Fast path: the string is already pooled, so only bump the refcount.
        if let Some(existing) = self.pool.get(s) {
            return Arc::clone(existing);
        }

        // Slow path: allocate once, keep one handle in the pool and hand the
        // other back to the caller.
        let interned: Arc<str> = Arc::from(s);
        self.pool.insert(Arc::clone(&interned));
        interned
    }

    /// Get or insert an owned string into the pool, returning an `Arc<str>`.
    ///
    /// Behaves exactly like [`StringPool::intern_str`]. The `String`'s buffer
    /// is not reused: converting a `String` into an `Arc<str>` always copies
    /// the bytes into a fresh reference-counted allocation, so the argument is
    /// simply dropped once the lookup (and, on a miss, the copy) is done.
    pub fn intern_string(&mut self, s: String) -> Arc<str> {
        self.intern_str(&s)
    }

    /// Backward-compatible convenience: accepts any `Into<String>`.
    ///
    /// Note: the argument is converted to a `String` before the lookup, so
    /// callers passing `&str` should prefer [`StringPool::intern_str`] to avoid
    /// allocating a temporary `String` on repeated lookups.
    pub fn intern(&mut self, s: impl Into<String>) -> Arc<str> {
        self.intern_string(s.into())
    }

    /// Get the number of unique strings in the pool.
    #[must_use]
    pub fn len(&self) -> usize {
        self.pool.len()
    }

    /// Check if the pool is empty.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.pool.is_empty()
    }

    /// Clear all entries from the pool.
    ///
    /// Handles previously returned by the pool stay valid because each caller
    /// owns its own `Arc`. Strings interned after a clear get new allocations
    /// and will not be pointer-equal to handles obtained before it.
    pub fn clear(&mut self) {
        self.pool.clear();
    }
}
120
#[cfg(test)]
mod tests {
    //! Unit tests for `StringPool`: construction, deduplication across the
    //! three interning entry points, clearing, and cloning.

    use super::*;

    #[test]
    fn test_string_pool_new() {
        let pool = StringPool::new();
        assert_eq!(pool.len(), 0);
        assert!(pool.is_empty());
    }

    #[test]
    fn test_string_pool_with_capacity() {
        let pool = StringPool::with_capacity(32);
        assert_eq!(pool.len(), 0);
        assert!(pool.is_empty());
        // `HashSet::with_capacity` reserves room for at least the requested
        // number of entries, so inserting up to that many strings will not
        // trigger a resize.
        assert!(pool.pool.capacity() >= 32);
    }

    #[test]
    fn test_identical_strings_return_same_arc() {
        let mut pool = StringPool::new();
        let s1 = pool.intern_str("Development");
        let s2 = pool.intern_str("Development");

        // Both should point to the same allocation
        assert!(Arc::ptr_eq(&s1, &s2));
        assert_eq!(*s1, *s2);
        assert_eq!(pool.len(), 1);
    }

    #[test]
    fn test_different_strings_return_different_arc() {
        let mut pool = StringPool::new();
        let s1 = pool.intern_str("Development");
        let s2 = pool.intern_str("Review");

        // Should point to different allocations
        assert!(!Arc::ptr_eq(&s1, &s2));
        assert_ne!(*s1, *s2);
        assert_eq!(pool.len(), 2);
    }

    #[test]
    fn test_pool_size_does_not_grow_for_repeated_strings() {
        let mut pool = StringPool::new();

        // Intern the same string multiple times
        for _ in 0..100 {
            pool.intern_str("Development");
        }

        // Pool should still only contain one entry
        assert_eq!(pool.len(), 1);
    }

    #[test]
    fn test_intern_different_string_types() {
        let mut pool = StringPool::new();

        // `&str` routed through the generic `Into<String>` entry point
        let from_str = pool.intern("test");
        // Owned `String` routed through the generic entry point
        let from_string = pool.intern(String::from("test"));
        // Owned `String` passed directly to `intern_string`
        let from_owned = pool.intern_string("test".to_string());
        // Borrowed slice passed directly to `intern_str`
        let from_slice = pool.intern_str("test");

        // Every entry point must resolve to the same allocation
        assert!(Arc::ptr_eq(&from_str, &from_string));
        assert!(Arc::ptr_eq(&from_string, &from_owned));
        assert!(Arc::ptr_eq(&from_owned, &from_slice));
        assert_eq!(pool.len(), 1);
    }

    #[test]
    fn test_intern_str_and_intern_string_share_entries() {
        // Regression test: the pool should store a single interned `Arc<str>` per
        // unique string, regardless of whether callers use &str or String.
        let mut pool = StringPool::new();

        let s1 = pool.intern_str("test");
        let s2 = pool.intern("test".to_string());
        let s3 = pool.intern(String::from("test"));

        assert!(Arc::ptr_eq(&s1, &s2));
        assert!(Arc::ptr_eq(&s2, &s3));
        assert_eq!(pool.len(), 1);
    }

    #[test]
    fn test_clear() {
        let mut pool = StringPool::new();
        pool.intern_str("Development");
        pool.intern_str("Review");
        assert_eq!(pool.len(), 2);

        pool.clear();
        assert_eq!(pool.len(), 0);
        assert!(pool.is_empty());
    }

    #[test]
    fn test_arc_content_matches_input() {
        let mut pool = StringPool::new();
        let arc = pool.intern_str("Development");
        assert_eq!(&*arc, "Development");
    }

    #[test]
    fn test_memory_efficiency_multiple_calls() {
        let mut pool = StringPool::new();

        // Create 1000 references to the same string
        let arcs: Vec<Arc<str>> = (0..1000)
            .map(|_| pool.intern_str("Development"))
            .collect();

        // Pool should still only contain one entry
        assert_eq!(pool.len(), 1);

        // All arcs should point to the same allocation
        let first = &arcs[0];
        assert!(arcs.iter().all(|arc| Arc::ptr_eq(first, arc)));
    }

    #[test]
    fn test_empty_string() {
        let mut pool = StringPool::new();
        let s1 = pool.intern_str("");
        let s2 = pool.intern_str("");

        assert!(Arc::ptr_eq(&s1, &s2));
        assert_eq!(&*s1, "");
        assert_eq!(pool.len(), 1);
    }

    #[test]
    fn test_clone_pool() {
        let mut pool = StringPool::new();
        let development = pool.intern_str("Development");
        pool.intern_str("Review");

        let mut cloned = pool.clone();
        assert_eq!(pool.len(), cloned.len());

        // The clone copies `Arc` handles, so it shares existing allocations.
        assert!(Arc::ptr_eq(&development, &cloned.intern_str("Development")));

        // New entries in the clone must not leak back into the original.
        cloned.intern_str("Testing");
        assert_eq!(cloned.len(), 3);
        assert_eq!(pool.len(), 2);
    }
}