shape_value/string_intern.rs
1//! Small-string interning (Phase D.4).
2//!
3//! Extracted from `value_word.rs` in Phase R6.3.
4//!
5//! ## Design rationale
6//!
7//! True small-string optimization (SSO) in the sense of "pack the bytes inline
8//! in the 8-byte ValueWord" is not feasible in the current layout: all 8
9//! NaN-boxing tag values (0b000..0b111) are already consumed (see `tag_bits`)
10//! and only 48 bits of payload are available, which is too few bytes to be
11//! useful (strings <= 6 bytes is a rounding error).
12//!
13//! Multi-slot SSO (spreading bytes across 2-3 adjacent stack slots) would
14//! require compiler support for multi-slot string bindings and invasive
15//! changes to the executor + JIT — not worth it as an isolated change.
16//!
17//! Instead, we collapse the common case of **repeated short strings** via
18//! a process-global intern pool. Programs allocate `Arc<String>` over and
19//! over for the same content (field names, enum tags, short literals like
20//! "ok", "id", "name"). With interning, N copies share a single allocation
21//! and the Arc refcount does the rest.
22//!
23//! ## Behavioural contract
24//!
25//! - `ValueWord::from_string(s)` still returns a `ValueWord` wrapping
26//! `Arc<String>`. Callers observe no change: `as_string()` / `as_heap_ref()`
27//! return the same `&str` content. Mutation is already impossible via
28//! `Arc<String>` (no `Arc::make_mut` is called on interned strings in the
29//! codebase — all string ops produce a new `String`).
30//! - Long strings (len > `INTERN_THRESHOLD`) bypass the pool entirely: the
31//! hash/lookup cost isn't justified for long unique payloads, and the
32//! memory win would be marginal.
33//! - The pool is bounded by `INTERN_CAP` entries. When full, new lookups
34//! fall through to the no-intern path — we never evict, keeping all live
35//! `Arc<String>` refs valid.
36//! - The pool uses `std::sync::LazyLock<Mutex<...>>`. A
37//! `HashMap<Arc<String>, ()>` (set
38//! semantics keyed by the Arc's string content) would work, but using
39//! `HashMap<Arc<String>, Arc<String>>` lets us return the *canonical* Arc
40//! without rebuilding one.
41//!
42//! ## Future work
43//!
44//! A fully-inline SSO (store up to ~22 bytes inline across a 24-byte heap
45//! object with its own refcount) would eliminate the outer `Arc` allocation
46//! entirely for short strings. That's a bigger change — it touches the
47//! HeapValue representation, VM executor string reads, JIT FFI, and wire
48//! serialization. Revisit once the `StringObj` / `UnifiedString` v2 paths
49//! are the primary runtime representation.
50
51use std::collections::HashMap;
52use std::sync::{Arc, LazyLock, Mutex};
53
54/// Strings with byte length <= this value are candidates for interning.
55/// Chosen to cover common field names, enum tags, and short literals
56/// (e.g. "ok", "err", "id", "name", "type", "value") while excluding
57/// long user content where the hash cost dominates.
58pub const INTERN_THRESHOLD: usize = 32;
59
60/// Hard cap on pool size. When reached, new strings bypass interning.
61/// Sized to comfortably fit all stdlib field names + enum tags + common
62/// literals across a large program. Entries are never evicted once
63/// inserted (the pool owns an Arc ref keeping the string alive).
64pub const INTERN_CAP: usize = 8192;
65
66static POOL: LazyLock<Mutex<HashMap<Arc<String>, Arc<String>>>> =
67 LazyLock::new(|| Mutex::new(HashMap::with_capacity(256)));
68
69/// Return the canonical `Arc<String>` for `s` if `s` is short enough to
70/// intern; otherwise return `s` unchanged. Callers should always use
71/// the returned Arc — it may be a different (shared) pointer than the
72/// input.
73#[inline]
74pub fn intern_short_string(s: Arc<String>) -> Arc<String> {
75 if s.len() > INTERN_THRESHOLD {
76 return s;
77 }
78 // Acquire the lock. If the mutex is poisoned (another thread panicked
79 // while holding it), fall through without interning rather than
80 // propagating the panic — interning is an optimization, not a
81 // correctness requirement.
82 let mut pool = match POOL.lock() {
83 Ok(guard) => guard,
84 Err(_) => return s,
85 };
86 if let Some(existing) = pool.get(&s) {
87 return existing.clone();
88 }
89 if pool.len() >= INTERN_CAP {
90 return s;
91 }
92 pool.insert(s.clone(), s.clone());
93 s
94}
95
96/// Test-only: current pool size. Pool entries are never cleared, so
97/// tests should use deltas (not absolute values) to verify growth.
98#[cfg(test)]
99pub(crate) fn __test_pool_len() -> usize {
100 POOL.lock().map(|p| p.len()).unwrap_or(0)
101}
102
103#[cfg(test)]
104mod tests {
105 use super::*;
106
107 // The intern pool is a process-global resource and tests run in parallel,
108 // so assertions must be robust to cross-test pollution. We use string
109 // contents that are unlikely to appear in other tests (unique prefixes
110 // per test) and check relative behavior ("two calls with the same content
111 // return Arc-equal results") rather than absolute pool sizes where
112 // possible.
113
114 #[test]
115 fn test_intern_short_strings_share_allocation() {
116 // Two separately-allocated `Arc<String>` with identical content should
117 // deduplicate to a single canonical Arc after going through the pool.
118 let a = intern_short_string(Arc::new("intern_test_share_name".to_string()));
119 let b = intern_short_string(Arc::new("intern_test_share_name".to_string()));
120 assert!(Arc::ptr_eq(&a, &b), "interned short strings must share allocation");
121 assert_eq!(&*a, "intern_test_share_name");
122 }
123
124 #[test]
125 fn test_intern_long_strings_bypass_pool() {
126 // A string longer than INTERN_THRESHOLD bypasses the pool.
127 // Use a unique prefix so other parallel tests can't collide.
128 let long = format!("intern_test_long_{}", "x".repeat(INTERN_THRESHOLD + 1));
129 assert!(long.len() > INTERN_THRESHOLD);
130 let a = intern_short_string(Arc::new(long.clone()));
131 let b = intern_short_string(Arc::new(long.clone()));
132 // Both pass through untouched; they are NOT the same allocation.
133 assert!(!Arc::ptr_eq(&a, &b), "long strings must not be interned");
134 assert_eq!(&*a, &*b);
135 }
136
137 #[test]
138 fn test_intern_threshold_boundary() {
139 // Exactly at the threshold: interned. (Use a fixed-length unique
140 // string — "aaaa..." padded to exactly THRESHOLD bytes.)
141 let at: String = std::iter::repeat('a').take(INTERN_THRESHOLD).collect();
142 let a1 = intern_short_string(Arc::new(at.clone()));
143 let a2 = intern_short_string(Arc::new(at.clone()));
144 assert!(Arc::ptr_eq(&a1, &a2), "len == threshold must intern");
145
146 // One past the threshold: NOT interned.
147 let over: String = std::iter::repeat('b').take(INTERN_THRESHOLD + 1).collect();
148 let b1 = intern_short_string(Arc::new(over.clone()));
149 let b2 = intern_short_string(Arc::new(over.clone()));
150 assert!(!Arc::ptr_eq(&b1, &b2), "len > threshold must not intern");
151 }
152
153 #[test]
154 fn test_intern_preserves_content_across_many_calls() {
155 // Repeatedly interning varied short strings returns correct content
156 // on every call, including for repeated inputs.
157 let inputs = [
158 "intern_test_many_a",
159 "intern_test_many_b",
160 "intern_test_many_c",
161 "intern_test_many_a", // duplicate
162 "intern_test_many_b", // duplicate
163 ];
164 let mut results = Vec::new();
165 for s in inputs {
166 results.push(intern_short_string(Arc::new(s.to_string())));
167 }
168 for (r, s) in results.iter().zip(inputs.iter()) {
169 assert_eq!(&***r, *s);
170 }
171 // Duplicates must be Arc-equal to their first occurrence.
172 assert!(Arc::ptr_eq(&results[0], &results[3]), "dup 'a' must share Arc");
173 assert!(Arc::ptr_eq(&results[1], &results[4]), "dup 'b' must share Arc");
174 }
175}