Skip to main content

oxihuman_core/
string_pool.rs

1//! String interning/pooling for memory efficiency.
2
3use std::collections::HashMap;
4
/// Handle that identifies one interned string inside a [`StringPool`].
///
/// The wrapped `u32` is the position of the string in the pool's storage
/// vector. Handles are cheap to copy, compare and hash.
#[allow(dead_code)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct StringId(pub u32);
9
10/// Pool that deduplicates and interns strings.
11#[allow(dead_code)]
12#[derive(Clone, Debug)]
13pub struct StringPool {
14    /// Map from string content to its id.
15    lookup: HashMap<String, StringId>,
16    /// Reverse map from id to string content.
17    strings: Vec<String>,
18}
19
20// ---------------------------------------------------------------------------
21// Construction
22// ---------------------------------------------------------------------------
23
24/// Create a new, empty string pool.
25#[allow(dead_code)]
26pub fn new_string_pool() -> StringPool {
27    StringPool {
28        lookup: HashMap::new(),
29        strings: Vec::new(),
30    }
31}
32
33// ---------------------------------------------------------------------------
34// Core operations
35// ---------------------------------------------------------------------------
36
37/// Intern a string, returning its `StringId`. If the string is already interned,
38/// the existing id is returned without duplication.
39#[allow(dead_code)]
40pub fn intern(pool: &mut StringPool, s: &str) -> StringId {
41    if let Some(&id) = pool.lookup.get(s) {
42        return id;
43    }
44    let id = StringId(pool.strings.len() as u32);
45    pool.strings.push(s.to_string());
46    pool.lookup.insert(s.to_string(), id);
47    id
48}
49
50/// Resolve a `StringId` back to its string content.
51/// Returns `None` if the id is invalid.
52#[allow(dead_code)]
53pub fn resolve(pool: &StringPool, id: StringId) -> Option<&str> {
54    pool.strings.get(id.0 as usize).map(|s| s.as_str())
55}
56
57/// Check whether a string is already interned.
58#[allow(dead_code)]
59pub fn contains(pool: &StringPool, s: &str) -> bool {
60    pool.lookup.contains_key(s)
61}
62
63/// Return the number of unique strings in the pool.
64#[allow(dead_code)]
65pub fn pool_size(pool: &StringPool) -> usize {
66    pool.strings.len()
67}
68
69/// Return the total number of bytes stored across all interned strings.
70#[allow(dead_code)]
71pub fn total_bytes(pool: &StringPool) -> usize {
72    pool.strings.iter().map(|s| s.len()).sum()
73}
74
75/// Intern multiple strings at once, returning their ids.
76#[allow(dead_code)]
77pub fn intern_many(pool: &mut StringPool, strings: &[&str]) -> Vec<StringId> {
78    strings.iter().map(|s| intern(pool, s)).collect()
79}
80
81/// Remove strings that are not in the `keep` set.
82/// Returns the number of strings removed.
83///
84/// Note: This invalidates all existing `StringId` handles. The pool is
85/// rebuilt with new ids.
86#[allow(dead_code)]
87pub fn remove_unused(pool: &mut StringPool, keep: &[StringId]) -> usize {
88    let keep_set: std::collections::HashSet<u32> = keep.iter().map(|id| id.0).collect();
89    let original_count = pool.strings.len();
90
91    let retained: Vec<String> = pool
92        .strings
93        .iter()
94        .enumerate()
95        .filter(|(i, _)| keep_set.contains(&(*i as u32)))
96        .map(|(_, s)| s.clone())
97        .collect();
98
99    pool.strings = retained;
100    pool.lookup.clear();
101    for (i, s) in pool.strings.iter().enumerate() {
102        pool.lookup.insert(s.clone(), StringId(i as u32));
103    }
104
105    original_count - pool.strings.len()
106}
107
108/// Check whether a `StringId` is valid in the current pool.
109#[allow(dead_code)]
110pub fn string_id_valid(pool: &StringPool, id: StringId) -> bool {
111    (id.0 as usize) < pool.strings.len()
112}
113
114/// Return pool statistics as a JSON string.
115#[allow(dead_code)]
116pub fn pool_stats_json(pool: &StringPool) -> String {
117    let count = pool_size(pool);
118    let bytes = total_bytes(pool);
119    let avg = if count > 0 {
120        bytes as f64 / count as f64
121    } else {
122        0.0
123    };
124    format!(
125        "{{\"unique_strings\":{},\"total_bytes\":{},\"average_length\":{:.2}}}",
126        count, bytes, avg
127    )
128}
129
130/// Remove all strings from the pool.
131#[allow(dead_code)]
132pub fn clear_pool(pool: &mut StringPool) {
133    pool.strings.clear();
134    pool.lookup.clear();
135}
136
137/// Merge another pool's strings into this pool.
138/// Returns the number of newly added strings.
139#[allow(dead_code)]
140pub fn merge_pools(dst: &mut StringPool, src: &StringPool) -> usize {
141    let before = pool_size(dst);
142    for s in &src.strings {
143        intern(dst, s);
144    }
145    pool_size(dst) - before
146}
147
148/// Find all interned strings that start with the given prefix.
149/// Returns their `StringId`s.
150#[allow(dead_code)]
151pub fn find_by_prefix(pool: &StringPool, prefix: &str) -> Vec<StringId> {
152    pool.strings
153        .iter()
154        .enumerate()
155        .filter(|(_, s)| s.starts_with(prefix))
156        .map(|(i, _)| StringId(i as u32))
157        .collect()
158}
159
160// ---------------------------------------------------------------------------
161// Tests
162// ---------------------------------------------------------------------------
163
// Unit tests for the string pool. Each test pins exact ids and contents
// rather than only counts, so id-assignment regressions are caught.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_new_string_pool_empty() {
        let pool = new_string_pool();
        assert_eq!(pool_size(&pool), 0);
        assert_eq!(total_bytes(&pool), 0);
    }

    #[test]
    fn test_intern_returns_id() {
        let mut pool = new_string_pool();
        // Ids are handed out sequentially, starting at zero.
        assert_eq!(intern(&mut pool, "hello"), StringId(0));
        assert_eq!(intern(&mut pool, "world"), StringId(1));
    }

    #[test]
    fn test_intern_deduplicates() {
        let mut pool = new_string_pool();
        let id1 = intern(&mut pool, "hello");
        let id2 = intern(&mut pool, "hello");
        assert_eq!(id1, id2);
        assert_eq!(pool_size(&pool), 1);
        assert_eq!(total_bytes(&pool), 5);
    }

    #[test]
    fn test_resolve_valid() {
        let mut pool = new_string_pool();
        let id = intern(&mut pool, "world");
        assert_eq!(resolve(&pool, id), Some("world"));
    }

    #[test]
    fn test_resolve_invalid() {
        let pool = new_string_pool();
        assert_eq!(resolve(&pool, StringId(999)), None);
    }

    #[test]
    fn test_contains() {
        let mut pool = new_string_pool();
        intern(&mut pool, "abc");
        assert!(contains(&pool, "abc"));
        assert!(!contains(&pool, "xyz"));
        // Matching is on exact content, not on prefixes.
        assert!(!contains(&pool, "ab"));
    }

    #[test]
    fn test_pool_size() {
        let mut pool = new_string_pool();
        intern(&mut pool, "a");
        intern(&mut pool, "b");
        intern(&mut pool, "c");
        assert_eq!(pool_size(&pool), 3);
    }

    #[test]
    fn test_total_bytes() {
        let mut pool = new_string_pool();
        intern(&mut pool, "ab"); // 2 bytes
        intern(&mut pool, "cde"); // 3 bytes
        assert_eq!(total_bytes(&pool), 5);
    }

    #[test]
    fn test_intern_many() {
        let mut pool = new_string_pool();
        // The repeated "x" must map to the id it received the first time.
        let ids = intern_many(&mut pool, &["x", "y", "x", "z"]);
        assert_eq!(ids, vec![StringId(0), StringId(1), StringId(0), StringId(2)]);
        assert_eq!(pool_size(&pool), 3);
    }

    #[test]
    fn test_remove_unused() {
        let mut pool = new_string_pool();
        let id0 = intern(&mut pool, "keep");
        intern(&mut pool, "remove");
        let removed = remove_unused(&mut pool, &[id0]);
        assert_eq!(removed, 1);
        assert_eq!(pool_size(&pool), 1);
        assert!(contains(&pool, "keep"));
        assert!(!contains(&pool, "remove"));
        assert_eq!(resolve(&pool, StringId(0)), Some("keep"));
    }

    #[test]
    fn test_remove_unused_reassigns_ids() {
        let mut pool = new_string_pool();
        intern(&mut pool, "a");
        let id_b = intern(&mut pool, "b");
        assert_eq!(id_b, StringId(1));

        // Unknown ids in `keep` are ignored; the survivor is renumbered to 0.
        let removed = remove_unused(&mut pool, &[id_b, StringId(42)]);
        assert_eq!(removed, 1);
        assert_eq!(resolve(&pool, StringId(0)), Some("b"));
        assert!(!string_id_valid(&pool, id_b));
        assert_eq!(intern(&mut pool, "b"), StringId(0));
    }

    #[test]
    fn test_string_id_valid() {
        let mut pool = new_string_pool();
        let id = intern(&mut pool, "test");
        assert!(string_id_valid(&pool, id));
        // First index past the end is the boundary case.
        assert!(!string_id_valid(&pool, StringId(1)));
        assert!(!string_id_valid(&pool, StringId(100)));
    }

    #[test]
    fn test_pool_stats_json() {
        let mut pool = new_string_pool();
        assert_eq!(
            pool_stats_json(&pool),
            "{\"unique_strings\":0,\"total_bytes\":0,\"average_length\":0.00}"
        );
        intern(&mut pool, "abc");
        assert_eq!(
            pool_stats_json(&pool),
            "{\"unique_strings\":1,\"total_bytes\":3,\"average_length\":3.00}"
        );
    }

    #[test]
    fn test_clear_pool() {
        let mut pool = new_string_pool();
        let id = intern(&mut pool, "a");
        intern(&mut pool, "b");
        clear_pool(&mut pool);
        assert_eq!(pool_size(&pool), 0);
        assert_eq!(total_bytes(&pool), 0);
        assert!(!contains(&pool, "a"));
        assert!(!string_id_valid(&pool, id));
        // Id numbering restarts after a clear.
        assert_eq!(intern(&mut pool, "b"), StringId(0));
    }

    #[test]
    fn test_merge_pools() {
        let mut pool1 = new_string_pool();
        intern(&mut pool1, "a");
        let mut pool2 = new_string_pool();
        intern(&mut pool2, "b");
        intern(&mut pool2, "c");
        let added = merge_pools(&mut pool1, &pool2);
        assert_eq!(added, 2);
        assert_eq!(pool_size(&pool1), 3);
        assert!(contains(&pool1, "b"));
        assert!(contains(&pool1, "c"));
        // The source pool is only read.
        assert_eq!(pool_size(&pool2), 2);
    }

    #[test]
    fn test_merge_pools_no_duplicates() {
        let mut pool1 = new_string_pool();
        let shared = intern(&mut pool1, "shared");
        let mut pool2 = new_string_pool();
        intern(&mut pool2, "shared");
        intern(&mut pool2, "new");
        let added = merge_pools(&mut pool1, &pool2);
        assert_eq!(added, 1); // only "new" is added
        assert_eq!(resolve(&pool1, shared), Some("shared"));
        assert_eq!(pool_size(&pool1), 2);
    }

    #[test]
    fn test_find_by_prefix() {
        let mut pool = new_string_pool();
        intern(&mut pool, "morph_face");
        intern(&mut pool, "morph_body");
        intern(&mut pool, "texture_skin");
        let results = find_by_prefix(&pool, "morph_");
        assert_eq!(results, vec![StringId(0), StringId(1)]);
    }

    #[test]
    fn test_find_by_prefix_empty() {
        let mut pool = new_string_pool();
        intern(&mut pool, "abc");
        let results = find_by_prefix(&pool, "xyz");
        assert!(results.is_empty());
    }
}
314}