Skip to main content

ralph_workflow/checkpoint/
string_pool.rs

1//! String interning for deduplicating repeated strings in execution history.
2//!
3//! This module provides a string pool that deduplicates commonly repeated strings
4//! (like phase names and agent names) by storing them as Arc<str>. This reduces
5//! memory usage when the same strings appear many times across execution history.
6
7use std::collections::HashSet;
8use std::iter;
9use std::sync::Arc;
10
11/// String pool for deduplicating commonly repeated strings in execution history.
12///
13/// Phase names and agent names are repeated frequently across execution history
14/// entries. Using Arc<str> with a string pool reduces memory usage by sharing
15/// the same allocation for identical strings.
16///
17/// # Example
18///
19/// ```
20/// use ralph_workflow::checkpoint::string_pool::StringPool;
21/// use std::sync::Arc;
22///
23/// let mut pool = StringPool::new();
24/// let phase1 = pool.intern("Development");
25/// let phase2 = pool.intern("Development");
26///
27/// // Both Arc<str> values point to the same allocation
28/// assert!(Arc::ptr_eq(&phase1, &phase2));
29/// ```
30#[derive(Debug, Clone, Default)]
31pub struct StringPool {
32    // Store a single allocation per unique string (the Arc payload).
33    // Using `Arc<str>` as the set key enables cheap cloning and lookup by `&str`.
34    pool: HashSet<Arc<str>>,
35}
36
37impl StringPool {
38    /// Create a new string pool with default capacity hint.
39    ///
40    /// Pre-allocates capacity for 16 unique strings, which is typical for
41    /// most pipeline runs (phase names, agent names, step types).
42    #[must_use]
43    pub fn new() -> Self {
44        Self::with_capacity(16)
45    }
46
47    /// Create a string pool with specific capacity.
48    ///
49    /// Use this when you know the expected number of unique strings to avoid
50    /// hash table resizing during initial population.
51    #[must_use]
52    pub fn with_capacity(capacity: usize) -> Self {
53        Self {
54            pool: HashSet::with_capacity(capacity),
55        }
56    }
57
58    /// Get or insert a string slice into the pool, returning an Arc<str>.
59    ///
60    /// Prefer this when the input is already a `&str` to avoid allocating a
61    /// temporary `String` on repeated calls.
62    ///
63    /// # Example
64    ///
65    /// ```
66    /// use ralph_workflow::checkpoint::string_pool::StringPool;
67    ///
68    /// let (pool, s1) = StringPool::new().intern_str("test");
69    /// let (pool, s2) = pool.intern_str("test");
70    /// assert!(std::sync::Arc::ptr_eq(&s1, &s2));
71    /// ```
72    #[must_use]
73    pub fn intern_str(self, s: &str) -> (Self, Arc<str>) {
74        if let Some(existing) = self.pool.get(s).map(Arc::clone) {
75            return (self, existing);
76        }
77
78        let interned: Arc<str> = Arc::from(s);
79        let pool = self
80            .pool
81            .into_iter()
82            .chain(iter::once(Arc::clone(&interned)))
83            .collect();
84        (Self { pool }, interned)
85    }
86
87    /// Get or insert an owned string into the pool, returning an Arc<str>.
88    ///
89    /// This path can reuse the allocation of the provided `String` when inserting.
90    #[must_use]
91    pub fn intern_string(self, s: String) -> (Self, Arc<str>) {
92        if let Some(existing) = self.pool.get(s.as_str()).map(Arc::clone) {
93            return (self, existing);
94        }
95
96        let interned: Arc<str> = Arc::from(s);
97        let pool = self
98            .pool
99            .into_iter()
100            .chain(iter::once(Arc::clone(&interned)))
101            .collect();
102        (Self { pool }, interned)
103    }
104
105    /// Backward-compatible convenience: accepts any `Into<String>`.
106    ///
107    /// Note: callers passing `&str` should prefer `intern_str()` to avoid
108    /// allocating a temporary `String` on repeated lookups.
109    pub fn intern(self, s: impl Into<String>) -> (Self, Arc<str>) {
110        self.intern_string(s.into())
111    }
112
113    /// Get the number of unique strings in the pool.
114    #[must_use]
115    pub fn len(&self) -> usize {
116        self.pool.len()
117    }
118
119    /// Check if the pool is empty.
120    #[must_use]
121    pub fn is_empty(&self) -> bool {
122        self.pool.is_empty()
123    }
124
125    /// Clear all entries from the pool.
126    #[must_use]
127    pub fn clear(self) -> Self {
128        Self::with_capacity(16)
129    }
130}
131
132#[cfg(test)]
133mod tests {
134    use super::*;
135
136    #[test]
137    fn test_string_pool_new() {
138        let pool = StringPool::new();
139        assert_eq!(pool.len(), 0);
140        assert!(pool.is_empty());
141    }
142
143    #[test]
144    fn test_string_pool_with_capacity() {
145        let pool = StringPool::with_capacity(32);
146        assert_eq!(pool.len(), 0);
147        assert!(pool.is_empty());
148        // Capacity is pre-allocated, so adding strings shouldn't trigger resize
149    }
150
151    #[test]
152    fn test_identical_strings_return_same_arc() {
153        let (pool, s1) = StringPool::new().intern_str("Development");
154        let (pool, s2) = pool.intern_str("Development");
155
156        // Both should point to the same allocation
157        assert!(Arc::ptr_eq(&s1, &s2));
158        assert_eq!(*s1, *s2);
159        assert_eq!(pool.len(), 1);
160    }
161
162    #[test]
163    fn test_different_strings_return_different_arc() {
164        let (pool, s1) = StringPool::new().intern_str("Development");
165        let (pool, s2) = pool.intern_str("Review");
166
167        // Should point to different allocations
168        assert!(!Arc::ptr_eq(&s1, &s2));
169        assert_ne!(*s1, *s2);
170        assert_eq!(pool.len(), 2);
171    }
172
173    #[test]
174    fn test_pool_size_does_not_grow_for_repeated_strings() {
175        let pool = (0..100).fold(StringPool::new(), |pool, _| {
176            pool.intern_str("Development").0
177        });
178
179        // Pool should still only contain one entry
180        assert_eq!(pool.len(), 1);
181    }
182
183    #[test]
184    fn test_intern_different_string_types() {
185        let (pool, s1) = StringPool::new().intern_str("test");
186
187        // Test with &str
188        let (pool, s2) = pool.intern("test".to_string());
189        let (pool, s3) = pool.intern(String::from("test"));
190
191        // All should point to the same allocation
192        assert!(Arc::ptr_eq(&s1, &s2));
193        assert!(Arc::ptr_eq(&s2, &s3));
194        assert_eq!(pool.len(), 1);
195    }
196
197    #[test]
198    fn test_intern_str_and_intern_string_share_entries() {
199        // Regression test: the pool should store a single interned Arc<str> per
200        // unique string, regardless of whether callers use &str or String.
201        let (pool, s1) = StringPool::new().intern_str("test");
202
203        let (pool, s2) = pool.intern("test".to_string());
204        let (pool, s3) = pool.intern(String::from("test"));
205
206        assert!(Arc::ptr_eq(&s1, &s2));
207        assert!(Arc::ptr_eq(&s2, &s3));
208        assert_eq!(pool.len(), 1);
209    }
210
211    #[test]
212    fn test_clear() {
213        let pool = StringPool::new()
214            .intern_str("Development")
215            .0
216            .intern_str("Review")
217            .0;
218        assert_eq!(pool.len(), 2);
219
220        let pool = pool.clear();
221        assert_eq!(pool.len(), 0);
222        assert!(pool.is_empty());
223    }
224
225    #[test]
226    fn test_arc_content_matches_input() {
227        let arc = StringPool::new().intern_str("Development").1;
228        assert_eq!(&*arc, "Development");
229    }
230
231    #[test]
232    fn test_memory_efficiency_multiple_calls() {
233        let pool = (0..1000).fold(StringPool::new(), |pool, _| {
234            pool.intern_str("Development").0
235        });
236
237        // Pool should still only contain one entry (deduplication works)
238        assert_eq!(pool.len(), 1);
239
240        // Interning from a single pool multiple times produces same Arc
241        let arcs: Vec<_> = (0..1000)
242            .map(|_| pool.clone().intern_str("Development").1)
243            .collect();
244
245        // All arcs from the same pool should point to the same allocation
246        assert!((1..arcs.len()).all(|i| Arc::ptr_eq(&arcs[0], &arcs[i])));
247    }
248
249    #[test]
250    fn test_empty_string() {
251        let (pool, s1) = StringPool::new().intern_str("");
252        let (pool, s2) = pool.intern_str("");
253
254        assert!(Arc::ptr_eq(&s1, &s2));
255        assert_eq!(&*s1, "");
256        assert_eq!(pool.len(), 1);
257    }
258
259    #[test]
260    fn test_clone_pool() {
261        let pool = StringPool::new()
262            .intern_str("Development")
263            .0
264            .intern_str("Review")
265            .0;
266
267        let cloned = pool.clone();
268        assert_eq!(pool.len(), cloned.len());
269    }
270}