Skip to main content

ralph_workflow/checkpoint/
string_pool.rs

1//! String interning for deduplicating repeated strings in execution history.
2//!
3//! This module provides a string pool that deduplicates commonly repeated strings
4//! (like phase names and agent names) by storing them as Arc<str>. This reduces
5//! memory usage when the same strings appear many times across execution history.
6
7use std::collections::HashSet;
8use std::iter;
9use std::sync::Arc;
10
11/// String pool for deduplicating commonly repeated strings in execution history.
12///
13/// Phase names and agent names are repeated frequently across execution history
14/// entries. Using Arc<str> with a string pool reduces memory usage by sharing
15/// the same allocation for identical strings.
16///
17/// # Example
18///
19/// ```
20/// use ralph_workflow::checkpoint::string_pool::StringPool;
21/// use std::sync::Arc;
22///
23/// let (pool, phase1) = StringPool::new().intern("Development");
24/// let (_, phase2) = pool.intern("Development");
25///
26/// // Both Arc<str> values point to the same allocation
27/// assert!(Arc::ptr_eq(&phase1, &phase2));
28/// ```
29#[derive(Debug, Clone, Default)]
30pub struct StringPool {
31    // Store a single allocation per unique string (the Arc payload).
32    // Using `Arc<str>` as the set key enables cheap cloning and lookup by `&str`.
33    pool: HashSet<Arc<str>>,
34}
35
36impl StringPool {
37    /// Create a new string pool with default capacity hint.
38    ///
39    /// Pre-allocates capacity for 16 unique strings, which is typical for
40    /// most pipeline runs (phase names, agent names, step types).
41    #[must_use]
42    pub fn new() -> Self {
43        Self::with_capacity(16)
44    }
45
46    /// Create a string pool with specific capacity.
47    ///
48    /// Use this when you know the expected number of unique strings to avoid
49    /// hash table resizing during initial population.
50    #[must_use]
51    pub fn with_capacity(capacity: usize) -> Self {
52        Self {
53            pool: HashSet::with_capacity(capacity),
54        }
55    }
56
57    /// Get or insert a string slice into the pool, returning an Arc<str>.
58    ///
59    /// Prefer this when the input is already a `&str` to avoid allocating a
60    /// temporary `String` on repeated calls.
61    ///
62    /// # Example
63    ///
64    /// ```
65    /// use ralph_workflow::checkpoint::string_pool::StringPool;
66    ///
67    /// let (pool, s1) = StringPool::new().intern_str("test");
68    /// let (pool, s2) = pool.intern_str("test");
69    /// assert!(std::sync::Arc::ptr_eq(&s1, &s2));
70    /// ```
71    #[must_use]
72    pub fn intern_str(self, s: &str) -> (Self, Arc<str>) {
73        if let Some(existing) = self.pool.get(s).map(Arc::clone) {
74            return (self, existing);
75        }
76
77        let interned: Arc<str> = Arc::from(s);
78        let pool = self
79            .pool
80            .into_iter()
81            .chain(iter::once(Arc::clone(&interned)))
82            .collect();
83        (Self { pool }, interned)
84    }
85
86    /// Get or insert an owned string into the pool, returning an Arc<str>.
87    ///
88    /// This path can reuse the allocation of the provided `String` when inserting.
89    #[must_use]
90    pub fn intern_string(self, s: String) -> (Self, Arc<str>) {
91        if let Some(existing) = self.pool.get(s.as_str()).map(Arc::clone) {
92            return (self, existing);
93        }
94
95        let interned: Arc<str> = Arc::from(s);
96        let pool = self
97            .pool
98            .into_iter()
99            .chain(iter::once(Arc::clone(&interned)))
100            .collect();
101        (Self { pool }, interned)
102    }
103
104    /// Backward-compatible convenience: accepts any `Into<String>`.
105    ///
106    /// Note: callers passing `&str` should prefer `intern_str()` to avoid
107    /// allocating a temporary `String` on repeated lookups.
108    pub fn intern(self, s: impl Into<String>) -> (Self, Arc<str>) {
109        self.intern_string(s.into())
110    }
111
112    /// Get the number of unique strings in the pool.
113    #[must_use]
114    pub fn len(&self) -> usize {
115        self.pool.len()
116    }
117
118    /// Check if the pool is empty.
119    #[must_use]
120    pub fn is_empty(&self) -> bool {
121        self.pool.is_empty()
122    }
123
124    /// Clear all entries from the pool.
125    #[must_use]
126    pub fn clear(self) -> Self {
127        Self::with_capacity(16)
128    }
129}
130
131#[cfg(test)]
132mod tests {
133    use super::*;
134
135    #[test]
136    fn test_string_pool_new() {
137        let pool = StringPool::new();
138        assert_eq!(pool.len(), 0);
139        assert!(pool.is_empty());
140    }
141
142    #[test]
143    fn test_string_pool_with_capacity() {
144        let pool = StringPool::with_capacity(32);
145        assert_eq!(pool.len(), 0);
146        assert!(pool.is_empty());
147        // Capacity is pre-allocated, so adding strings shouldn't trigger resize
148    }
149
150    #[test]
151    fn test_identical_strings_return_same_arc() {
152        let (pool, s1) = StringPool::new().intern_str("Development");
153        let (pool, s2) = pool.intern_str("Development");
154
155        // Both should point to the same allocation
156        assert!(Arc::ptr_eq(&s1, &s2));
157        assert_eq!(*s1, *s2);
158        assert_eq!(pool.len(), 1);
159    }
160
161    #[test]
162    fn test_different_strings_return_different_arc() {
163        let (pool, s1) = StringPool::new().intern_str("Development");
164        let (pool, s2) = pool.intern_str("Review");
165
166        // Should point to different allocations
167        assert!(!Arc::ptr_eq(&s1, &s2));
168        assert_ne!(*s1, *s2);
169        assert_eq!(pool.len(), 2);
170    }
171
172    #[test]
173    fn test_pool_size_does_not_grow_for_repeated_strings() {
174        let pool = (0..100).fold(StringPool::new(), |pool, _| {
175            pool.intern_str("Development").0
176        });
177
178        // Pool should still only contain one entry
179        assert_eq!(pool.len(), 1);
180    }
181
182    #[test]
183    fn test_intern_different_string_types() {
184        let (pool, s1) = StringPool::new().intern_str("test");
185
186        // Test with &str
187        let (pool, s2) = pool.intern("test".to_string());
188        let (pool, s3) = pool.intern(String::from("test"));
189
190        // All should point to the same allocation
191        assert!(Arc::ptr_eq(&s1, &s2));
192        assert!(Arc::ptr_eq(&s2, &s3));
193        assert_eq!(pool.len(), 1);
194    }
195
196    #[test]
197    fn test_intern_str_and_intern_string_share_entries() {
198        // Regression test: the pool should store a single interned Arc<str> per
199        // unique string, regardless of whether callers use &str or String.
200        let (pool, s1) = StringPool::new().intern_str("test");
201
202        let (pool, s2) = pool.intern("test".to_string());
203        let (pool, s3) = pool.intern(String::from("test"));
204
205        assert!(Arc::ptr_eq(&s1, &s2));
206        assert!(Arc::ptr_eq(&s2, &s3));
207        assert_eq!(pool.len(), 1);
208    }
209
210    #[test]
211    fn test_clear() {
212        let pool = StringPool::new()
213            .intern_str("Development")
214            .0
215            .intern_str("Review")
216            .0;
217        assert_eq!(pool.len(), 2);
218
219        let pool = pool.clear();
220        assert_eq!(pool.len(), 0);
221        assert!(pool.is_empty());
222    }
223
224    #[test]
225    fn test_arc_content_matches_input() {
226        let arc = StringPool::new().intern_str("Development").1;
227        assert_eq!(&*arc, "Development");
228    }
229
230    #[test]
231    fn test_memory_efficiency_multiple_calls() {
232        let pool = (0..1000).fold(StringPool::new(), |pool, _| {
233            pool.intern_str("Development").0
234        });
235
236        // Pool should still only contain one entry (deduplication works)
237        assert_eq!(pool.len(), 1);
238
239        // Interning from a single pool multiple times produces same Arc
240        let arcs: Vec<_> = (0..1000)
241            .map(|_| pool.clone().intern_str("Development").1)
242            .collect();
243
244        // All arcs from the same pool should point to the same allocation
245        assert!((1..arcs.len()).all(|i| Arc::ptr_eq(&arcs[0], &arcs[i])));
246    }
247
248    #[test]
249    fn test_empty_string() {
250        let (pool, s1) = StringPool::new().intern_str("");
251        let (pool, s2) = pool.intern_str("");
252
253        assert!(Arc::ptr_eq(&s1, &s2));
254        assert_eq!(&*s1, "");
255        assert_eq!(pool.len(), 1);
256    }
257
258    #[test]
259    fn test_clone_pool() {
260        let pool = StringPool::new()
261            .intern_str("Development")
262            .0
263            .intern_str("Review")
264            .0;
265
266        let cloned = pool.clone();
267        assert_eq!(pool.len(), cloned.len());
268    }
269}