Skip to main content

shape_value/
string_intern.rs

1//! Small-string interning (Phase D.4).
2//!
3//! Extracted from `value_word.rs` in Phase R6.3.
4//!
5//! ## Design rationale
6//!
7//! True small-string optimization (SSO) in the sense of "pack the bytes inline
8//! in the 8-byte ValueWord" is not feasible in the current layout: all 8
9//! NaN-boxing tag values (0b000..0b111) are already consumed (see `tag_bits`)
10//! and only 48 bits of payload are available, which is too few bytes to be
11//! useful (strings <= 6 bytes is a rounding error).
12//!
13//! Multi-slot SSO (spreading bytes across 2-3 adjacent stack slots) would
14//! require compiler support for multi-slot string bindings and invasive
15//! changes to the executor + JIT — not worth it as an isolated change.
16//!
17//! Instead, we collapse the common case of **repeated short strings** via
18//! a process-global intern pool. Programs allocate `Arc<String>` over and
19//! over for the same content (field names, enum tags, short literals like
20//! "ok", "id", "name"). With interning, N copies share a single allocation
21//! and the Arc refcount does the rest.
22//!
23//! ## Behavioural contract
24//!
25//! - `ValueWord::from_string(s)` still returns a `ValueWord` wrapping
26//!   `Arc<String>`. Callers observe no change: `as_string()` / `as_heap_ref()`
27//!   return the same `&str` content. Mutation is already impossible via
28//!   `Arc<String>` (no `Arc::make_mut` is called on interned strings in the
29//!   codebase — all string ops produce a new `String`).
30//! - Long strings (len > `INTERN_THRESHOLD`) bypass the pool entirely: the
31//!   hash/lookup cost isn't justified for long unique payloads, and the
32//!   memory win would be marginal.
33//! - The pool is bounded by `INTERN_CAP` entries. When full, new lookups
34//!   fall through to the no-intern path — we never evict, keeping all live
35//!   `Arc<String>` refs valid.
36//! - The pool uses `std::sync::LazyLock<Mutex<...>>`. A
37//!   `HashMap<Arc<String>, ()>` (set
38//!   semantics keyed by the Arc's string content) would work, but using
39//!   `HashMap<Arc<String>, Arc<String>>` lets us return the *canonical* Arc
40//!   without rebuilding one.
41//!
42//! ## Future work
43//!
44//! A fully-inline SSO (store up to ~22 bytes inline across a 24-byte heap
45//! object with its own refcount) would eliminate the outer `Arc` allocation
46//! entirely for short strings. That's a bigger change — it touches the
47//! HeapValue representation, VM executor string reads, JIT FFI, and wire
48//! serialization. Revisit once the `StringObj` / `UnifiedString` v2 paths
49//! are the primary runtime representation.
50
51use std::collections::HashMap;
52use std::sync::{Arc, LazyLock, Mutex};
53
54/// Strings with byte length <= this value are candidates for interning.
55/// Chosen to cover common field names, enum tags, and short literals
56/// (e.g. "ok", "err", "id", "name", "type", "value") while excluding
57/// long user content where the hash cost dominates.
58pub const INTERN_THRESHOLD: usize = 32;
59
60/// Hard cap on pool size. When reached, new strings bypass interning.
61/// Sized to comfortably fit all stdlib field names + enum tags + common
62/// literals across a large program. Entries are never evicted once
63/// inserted (the pool owns an Arc ref keeping the string alive).
64pub const INTERN_CAP: usize = 8192;
65
66static POOL: LazyLock<Mutex<HashMap<Arc<String>, Arc<String>>>> =
67    LazyLock::new(|| Mutex::new(HashMap::with_capacity(256)));
68
69/// Return the canonical `Arc<String>` for `s` if `s` is short enough to
70/// intern; otherwise return `s` unchanged. Callers should always use
71/// the returned Arc — it may be a different (shared) pointer than the
72/// input.
73#[inline]
74pub fn intern_short_string(s: Arc<String>) -> Arc<String> {
75    if s.len() > INTERN_THRESHOLD {
76        return s;
77    }
78    // Acquire the lock. If the mutex is poisoned (another thread panicked
79    // while holding it), fall through without interning rather than
80    // propagating the panic — interning is an optimization, not a
81    // correctness requirement.
82    let mut pool = match POOL.lock() {
83        Ok(guard) => guard,
84        Err(_) => return s,
85    };
86    if let Some(existing) = pool.get(&s) {
87        return existing.clone();
88    }
89    if pool.len() >= INTERN_CAP {
90        return s;
91    }
92    pool.insert(s.clone(), s.clone());
93    s
94}
95
96/// Test-only: current pool size. Pool entries are never cleared, so
97/// tests should use deltas (not absolute values) to verify growth.
98#[cfg(test)]
99pub(crate) fn __test_pool_len() -> usize {
100    POOL.lock().map(|p| p.len()).unwrap_or(0)
101}
102
103#[cfg(test)]
104mod tests {
105    use super::*;
106
107    // The intern pool is a process-global resource and tests run in parallel,
108    // so assertions must be robust to cross-test pollution. We use string
109    // contents that are unlikely to appear in other tests (unique prefixes
110    // per test) and check relative behavior ("two calls with the same content
111    // return Arc-equal results") rather than absolute pool sizes where
112    // possible.
113
114    #[test]
115    fn test_intern_short_strings_share_allocation() {
116        // Two separately-allocated `Arc<String>` with identical content should
117        // deduplicate to a single canonical Arc after going through the pool.
118        let a = intern_short_string(Arc::new("intern_test_share_name".to_string()));
119        let b = intern_short_string(Arc::new("intern_test_share_name".to_string()));
120        assert!(Arc::ptr_eq(&a, &b), "interned short strings must share allocation");
121        assert_eq!(&*a, "intern_test_share_name");
122    }
123
124    #[test]
125    fn test_intern_long_strings_bypass_pool() {
126        // A string longer than INTERN_THRESHOLD bypasses the pool.
127        // Use a unique prefix so other parallel tests can't collide.
128        let long = format!("intern_test_long_{}", "x".repeat(INTERN_THRESHOLD + 1));
129        assert!(long.len() > INTERN_THRESHOLD);
130        let a = intern_short_string(Arc::new(long.clone()));
131        let b = intern_short_string(Arc::new(long.clone()));
132        // Both pass through untouched; they are NOT the same allocation.
133        assert!(!Arc::ptr_eq(&a, &b), "long strings must not be interned");
134        assert_eq!(&*a, &*b);
135    }
136
137    #[test]
138    fn test_intern_threshold_boundary() {
139        // Exactly at the threshold: interned. (Use a fixed-length unique
140        // string — "aaaa..." padded to exactly THRESHOLD bytes.)
141        let at: String = std::iter::repeat('a').take(INTERN_THRESHOLD).collect();
142        let a1 = intern_short_string(Arc::new(at.clone()));
143        let a2 = intern_short_string(Arc::new(at.clone()));
144        assert!(Arc::ptr_eq(&a1, &a2), "len == threshold must intern");
145
146        // One past the threshold: NOT interned.
147        let over: String = std::iter::repeat('b').take(INTERN_THRESHOLD + 1).collect();
148        let b1 = intern_short_string(Arc::new(over.clone()));
149        let b2 = intern_short_string(Arc::new(over.clone()));
150        assert!(!Arc::ptr_eq(&b1, &b2), "len > threshold must not intern");
151    }
152
153    #[test]
154    fn test_intern_preserves_content_across_many_calls() {
155        // Repeatedly interning varied short strings returns correct content
156        // on every call, including for repeated inputs.
157        let inputs = [
158            "intern_test_many_a",
159            "intern_test_many_b",
160            "intern_test_many_c",
161            "intern_test_many_a", // duplicate
162            "intern_test_many_b", // duplicate
163        ];
164        let mut results = Vec::new();
165        for s in inputs {
166            results.push(intern_short_string(Arc::new(s.to_string())));
167        }
168        for (r, s) in results.iter().zip(inputs.iter()) {
169            assert_eq!(&***r, *s);
170        }
171        // Duplicates must be Arc-equal to their first occurrence.
172        assert!(Arc::ptr_eq(&results[0], &results[3]), "dup 'a' must share Arc");
173        assert!(Arc::ptr_eq(&results[1], &results[4]), "dup 'b' must share Arc");
174    }
175}