Skip to main content

sqry_core/graph/unified/storage/
interner.rs

1//! `StringInterner`: Node name deduplication with reference counting.
2//!
3//! This module implements `StringInterner`, which provides efficient storage
4//! of strings by deduplicating identical values.
5//!
6//! # Design
7//!
8//! - **Deduplication**: Same string returns same `StringId`
9//! - **Reference counting**: Tracks usage for GC eligibility
10//! - **Thread-safe**: Uses `Arc<str>` for shared ownership
11//!
12//! # Memory Layout
13//!
14//! ```text
15//! StringInterner:
16//! ┌─────────────────────────────────────────────┐
//! │ strings: Vec<Option<Arc<str>>>              │  Indexed by StringId
18//! │ lookup: HashMap<Arc<str>, u32>              │  String → index
19//! │ ref_counts: Vec<u32>                        │  Reference counts
20//! │ free_list: Vec<u32>                         │  Recycled slots
21//! │ lookup_stale: bool                          │  Invariant guard
22//! └─────────────────────────────────────────────┘
23//! ```
24//!
25//! # Lookup Staleness Invariant
26//!
27//! When bulk operations (`alloc_range`, `bulk_slices_mut`) write string
28//! slots without updating the `lookup` `HashMap`, the `lookup_stale` flag
29//! is set. Methods that depend on `lookup` correctness (`intern`, `get`,
30//! `contains`, `len`, `is_empty`, `recycle_unreferenced`) assert that
31//! this flag is `false`. The flag is cleared by `build_dedup_table()`
32//! (which rebuilds the lookup) or `truncate_to()` (which rolls back to
33//! pre-allocation state).
34
35use std::collections::HashMap;
36use std::fmt;
37use std::sync::Arc;
38
39use serde::{Deserialize, Serialize};
40
41use super::super::string::id::StringId;
42
43/// Error returned when a `StringId` is invalid or unresolvable.
44#[derive(Debug, Clone, PartialEq, Eq)]
45pub struct ResolveError {
46    /// The invalid `StringId`
47    pub id: StringId,
48}
49
50impl fmt::Display for ResolveError {
51    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
52        write!(f, "failed to resolve StringId {:?}", self.id)
53    }
54}
55
56impl std::error::Error for ResolveError {}
57
/// Error returned when interning a string fails.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum InternError {
    /// The interner has exhausted all available IDs (> 2^32 - 2 strings).
    CapacityExhausted,
}

impl fmt::Display for InternError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let message = match self {
            Self::CapacityExhausted => "string interner capacity exhausted (> 2^32 - 2 strings)",
        };
        f.write_str(message)
    }
}

impl std::error::Error for InternError {}
76
/// String interner for symbol name deduplication.
///
/// `StringInterner` stores strings efficiently by maintaining a single
/// copy of each unique string. When the same string is interned multiple
/// times, the same `StringId` is returned.
///
/// # Reference Counting
///
/// Each interned string has an associated reference count. This enables
/// garbage collection of unused strings during compaction phases
/// (see `recycle_unreferenced`).
///
/// # Thread Safety
///
/// The interner uses `Arc<str>` for string storage, making it safe to
/// share resolved strings across threads. However, the interner itself
/// requires external synchronization (e.g., `RwLock`) for concurrent access.
///
/// # Invariants
///
/// - `strings` and `ref_counts` always have the same length.
/// - Slot 0 is the INVALID sentinel: always `None` with a ref count of 0.
/// - When `lookup_stale` is `false`, `lookup` maps exactly the occupied
///   slots' strings to their indices.
///
/// # Example
///
/// ```rust,ignore
/// let mut interner = StringInterner::new();
///
/// let id1 = interner.intern("foo");
/// let id2 = interner.intern("foo");
/// assert_eq!(id1, id2); // Same string → same ID
///
/// let resolved = interner.resolve(id1).unwrap();
/// assert_eq!(&*resolved, "foo");
/// ```
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StringInterner {
    /// Storage of interned strings, indexed by `StringId`.
    /// `None` marks the sentinel slot 0 and recycled/unfilled slots.
    strings: Vec<Option<Arc<str>>>,
    /// Reverse lookup from string to index.
    // Serialized in sorted order for deterministic output.
    #[serde(with = "super::serde_helpers::sorted_arc_str_map")]
    lookup: HashMap<Arc<str>, u32>,
    /// Reference count for each string (parallel to `strings`).
    ref_counts: Vec<u32>,
    /// Free list of recycled slot indices, reused by `intern` before appending.
    free_list: Vec<u32>,
    /// Optional maximum number of IDs (for testing error paths).
    ///
    /// When set, the interner will return `InternError::CapacityExhausted`
    /// when trying to allocate more than `max_ids` strings.
    /// When `None`, uses the default limit of `u32::MAX - 1`.
    #[serde(skip, default)]
    max_ids: Option<u32>,
    /// Invariant guard: `true` when bulk operations (e.g., `alloc_range`,
    /// `bulk_slices_mut`) have written string slots without updating the
    /// `lookup` `HashMap`. Methods that depend on `lookup` correctness
    /// (e.g., `intern`, `get`, `contains`) assert this is `false`.
    ///
    /// Cleared by `build_dedup_table()`, which rebuilds `lookup` from
    /// the canonical slot entries.
    #[serde(skip, default)]
    lookup_stale: bool,
}
134
135impl StringInterner {
136    /// Creates a new empty string interner.
137    #[must_use]
138    pub fn new() -> Self {
139        // Reserve index 0 for INVALID sentinel
140        Self {
141            strings: vec![None],
142            lookup: HashMap::new(),
143            ref_counts: vec![0],
144            free_list: Vec::new(),
145            max_ids: None,
146            lookup_stale: false,
147        }
148    }
149
150    /// Creates a new interner with the specified capacity.
151    #[must_use]
152    pub fn with_capacity(capacity: usize) -> Self {
153        let mut strings = Vec::with_capacity(capacity + 1);
154        let mut ref_counts = Vec::with_capacity(capacity + 1);
155        strings.push(None); // Reserve index 0
156        ref_counts.push(0);
157
158        Self {
159            strings,
160            lookup: HashMap::with_capacity(capacity),
161            ref_counts,
162            free_list: Vec::new(),
163            max_ids: None,
164            lookup_stale: false,
165        }
166    }
167
168    /// Creates a new interner with a hard limit on the number of IDs.
169    ///
170    /// This constructor is designed for **testing error paths**. It allows
171    /// deterministic testing of `InternError::CapacityExhausted` handling
172    /// without requiring billions of strings.
173    ///
174    /// # Arguments
175    ///
176    /// * `max_ids` - Maximum number of unique strings that can be interned.
177    ///   Once this limit is reached, `intern()` will return
178    ///   `InternError::CapacityExhausted`.
179    ///
180    /// # Example
181    ///
182    /// ```rust,ignore
183    /// // Create an interner that can only hold 3 strings
184    /// let mut interner = StringInterner::with_max_ids(3);
185    ///
186    /// interner.intern("a").unwrap(); // OK
187    /// interner.intern("b").unwrap(); // OK
188    /// interner.intern("c").unwrap(); // OK
189    /// assert!(interner.intern("d").is_err()); // CapacityExhausted
190    /// ```
191    #[must_use]
192    pub fn with_max_ids(max_ids: u32) -> Self {
193        Self {
194            strings: vec![None],
195            lookup: HashMap::new(),
196            ref_counts: vec![0],
197            free_list: Vec::new(),
198            max_ids: Some(max_ids),
199            lookup_stale: false,
200        }
201    }
202
203    /// Returns the number of interned strings (excluding INVALID slot).
204    ///
205    /// # Panics
206    ///
207    /// Panics if the lookup is stale (bulk slots written without rebuild).
208    #[inline]
209    #[must_use]
210    pub fn len(&self) -> usize {
211        assert!(
212            !self.lookup_stale,
213            "StringInterner::len() called while lookup is stale \
214             (bulk slots written without rebuild). Call build_dedup_table() first."
215        );
216        self.lookup.len()
217    }
218
219    /// Returns true if no strings are interned.
220    ///
221    /// # Panics
222    ///
223    /// Panics if the lookup is stale (bulk slots written without rebuild).
224    #[inline]
225    #[must_use]
226    pub fn is_empty(&self) -> bool {
227        assert!(
228            !self.lookup_stale,
229            "StringInterner::is_empty() called while lookup is stale \
230             (bulk slots written without rebuild). Call build_dedup_table() first."
231        );
232        self.lookup.is_empty()
233    }
234
235    /// Interns a string and returns its `StringId`.
236    ///
237    /// If the string was already interned, returns the existing ID and
238    /// increments its reference count. Otherwise, allocates a new ID.
239    ///
240    /// # Errors
241    ///
242    /// Returns `InternError::CapacityExhausted` if the interner has
243    /// exhausted all available IDs (> 2^32 - 2 strings), or if `max_ids`
244    /// is set and the limit has been reached.
245    ///
246    /// # Panics
247    ///
248    /// Panics if the lookup is stale and has not been rebuilt with
249    /// `build_dedup_table()`.
250    pub fn intern(&mut self, s: &str) -> Result<StringId, InternError> {
251        assert!(
252            !self.lookup_stale,
253            "StringInterner::intern() called while lookup is stale \
254             (bulk slots written without rebuild). Call build_dedup_table() first."
255        );
256        // Check if already interned
257        if let Some(&index) = self.lookup.get(s) {
258            // Increment reference count
259            self.ref_counts[index as usize] = self.ref_counts[index as usize].saturating_add(1);
260            return Ok(StringId::new(index));
261        }
262
263        // Check max_ids limit if set (used for testing)
264        if let Some(max) = self.max_ids
265            && self.lookup.len() >= max as usize
266        {
267            return Err(InternError::CapacityExhausted);
268        }
269
270        // Allocate new slot
271        let arc_str: Arc<str> = Arc::from(s);
272        let index = if let Some(free_idx) = self.free_list.pop() {
273            // Reuse a recycled slot
274            self.strings[free_idx as usize] = Some(Arc::clone(&arc_str));
275            self.ref_counts[free_idx as usize] = 1;
276            free_idx
277        } else {
278            // Append new slot
279            let idx = self.strings.len();
280            let idx_u32 = u32::try_from(idx).map_err(|_| InternError::CapacityExhausted)?;
281            if idx_u32 == u32::MAX {
282                return Err(InternError::CapacityExhausted);
283            }
284            // Reserve the high bit for staging-local IDs.
285            if idx_u32 & StringId::LOCAL_TAG_BIT != 0 {
286                return Err(InternError::CapacityExhausted);
287            }
288            self.strings.push(Some(Arc::clone(&arc_str)));
289            self.ref_counts.push(1);
290            idx_u32
291        };
292
293        self.lookup.insert(arc_str, index);
294        Ok(StringId::new(index))
295    }
296
297    /// Interns a string and returns its `StringId` without incrementing ref count.
298    ///
299    /// This is useful when the string is being stored in a structure that
300    /// will manage its own lifetime (e.g., node entry).
301    ///
302    /// # Errors
303    ///
304    /// Returns `InternError::CapacityExhausted` if the interner has
305    /// exhausted all available IDs (> 2^32 - 2 strings), or if `max_ids`
306    /// is set and the limit has been reached.
307    ///
308    /// # Panics
309    ///
310    /// Panics if the lookup is stale and has not been rebuilt with
311    /// `build_dedup_table()`.
312    #[allow(dead_code)] // Used by graph builder (Step 20)
313    pub fn intern_without_ref(&mut self, s: &str) -> Result<StringId, InternError> {
314        assert!(
315            !self.lookup_stale,
316            "StringInterner::intern_without_ref() called while lookup is stale \
317             (bulk slots written without rebuild). Call build_dedup_table() first."
318        );
319        // Check if already interned
320        if let Some(&index) = self.lookup.get(s) {
321            return Ok(StringId::new(index));
322        }
323
324        // Check max_ids limit if set (used for testing)
325        if let Some(max) = self.max_ids
326            && self.lookup.len() >= max as usize
327        {
328            return Err(InternError::CapacityExhausted);
329        }
330
331        // Allocate new slot with ref_count = 0
332        let arc_str: Arc<str> = Arc::from(s);
333        let index = if let Some(free_idx) = self.free_list.pop() {
334            self.strings[free_idx as usize] = Some(Arc::clone(&arc_str));
335            self.ref_counts[free_idx as usize] = 0;
336            free_idx
337        } else {
338            let idx = self.strings.len();
339            let idx_u32 = u32::try_from(idx).map_err(|_| InternError::CapacityExhausted)?;
340            if idx_u32 == u32::MAX {
341                return Err(InternError::CapacityExhausted);
342            }
343            // Reserve the high bit for staging-local IDs.
344            if idx_u32 & StringId::LOCAL_TAG_BIT != 0 {
345                return Err(InternError::CapacityExhausted);
346            }
347            self.strings.push(Some(Arc::clone(&arc_str)));
348            self.ref_counts.push(0);
349            idx_u32
350        };
351
352        self.lookup.insert(arc_str, index);
353        Ok(StringId::new(index))
354    }
355
356    /// Resolves a `StringId` to its string value.
357    ///
358    /// Returns `None` if the ID is invalid or has been recycled.
359    #[must_use]
360    pub fn resolve(&self, id: StringId) -> Option<Arc<str>> {
361        if id.is_invalid() {
362            return None;
363        }
364
365        let index = id.index() as usize;
366        self.strings.get(index).and_then(std::clone::Clone::clone)
367    }
368
369    /// Returns the reference count for a string.
370    ///
371    /// Returns 0 if the ID is invalid or has been recycled.
372    #[must_use]
373    pub fn ref_count(&self, id: StringId) -> u32 {
374        if id.is_invalid() {
375            return 0;
376        }
377
378        let index = id.index() as usize;
379        self.ref_counts.get(index).copied().unwrap_or(0)
380    }
381
382    /// Increments the reference count for a string.
383    ///
384    /// Returns the new count, or None if the ID is invalid.
385    pub fn inc_ref(&mut self, id: StringId) -> Option<u32> {
386        if id.is_invalid() {
387            return None;
388        }
389
390        let index = id.index() as usize;
391        if index < self.ref_counts.len() && self.strings[index].is_some() {
392            self.ref_counts[index] = self.ref_counts[index].saturating_add(1);
393            Some(self.ref_counts[index])
394        } else {
395            None
396        }
397    }
398
399    /// Decrements the reference count for a string.
400    ///
401    /// Returns the new count, or None if the ID is invalid.
402    /// Note: This does NOT automatically recycle the string when count reaches 0.
403    /// Use `recycle_unreferenced()` during compaction for that.
404    pub fn dec_ref(&mut self, id: StringId) -> Option<u32> {
405        if id.is_invalid() {
406            return None;
407        }
408
409        let index = id.index() as usize;
410        if index < self.ref_counts.len() && self.strings[index].is_some() {
411            self.ref_counts[index] = self.ref_counts[index].saturating_sub(1);
412            Some(self.ref_counts[index])
413        } else {
414            None
415        }
416    }
417
418    /// Recycles all strings with zero reference count.
419    ///
420    /// Returns the number of strings recycled.
421    /// This should be called during compaction phases.
422    ///
423    /// # Panics
424    ///
425    /// Panics if the lookup is stale (bulk slots written without rebuild).
426    #[allow(dead_code)] // Used by Compaction (Step 15)
427    pub fn recycle_unreferenced(&mut self) -> usize {
428        assert!(
429            !self.lookup_stale,
430            "StringInterner::recycle_unreferenced() called while lookup is stale \
431             (bulk slots written without rebuild). Call build_dedup_table() first."
432        );
433        let mut recycled = 0;
434
435        for index in 1..self.strings.len() {
436            if self.ref_counts[index] == 0
437                && let Some(arc_str) = self.strings[index].take()
438            {
439                self.lookup.remove(&arc_str);
440                if let Ok(index_u32) = u32::try_from(index) {
441                    self.free_list.push(index_u32);
442                }
443                recycled += 1;
444            }
445        }
446
447        recycled
448    }
449
450    /// Checks if a string is interned.
451    ///
452    /// # Panics
453    ///
454    /// Panics if the lookup is stale (bulk slots written without rebuild).
455    #[must_use]
456    pub fn contains(&self, s: &str) -> bool {
457        assert!(
458            !self.lookup_stale,
459            "StringInterner::contains() called while lookup is stale \
460             (bulk slots written without rebuild). Call build_dedup_table() first."
461        );
462        self.lookup.contains_key(s)
463    }
464
465    /// Gets the `StringId` for a string if it's already interned.
466    ///
467    /// Unlike `intern()`, this does not create a new entry or modify ref counts.
468    ///
469    /// # Panics
470    ///
471    /// Panics if the lookup is stale (bulk slots written without rebuild).
472    #[must_use]
473    pub fn get(&self, s: &str) -> Option<StringId> {
474        assert!(
475            !self.lookup_stale,
476            "StringInterner::get() called while lookup is stale \
477             (bulk slots written without rebuild). Call build_dedup_table() first."
478        );
479        self.lookup.get(s).map(|&idx| StringId::new(idx))
480    }
481
482    /// Returns an iterator over all interned strings with their IDs.
483    pub fn iter(&self) -> impl Iterator<Item = (StringId, &Arc<str>)> {
484        self.strings
485            .iter()
486            .enumerate()
487            .skip(1) // Skip INVALID slot
488            .filter_map(|(idx, opt)| {
489                let index_u32 = u32::try_from(idx).ok()?;
490                opt.as_ref().map(|s| (StringId::new(index_u32), s))
491            })
492    }
493
494    /// Clears all interned strings.
495    ///
496    /// Resets the interner to empty state, including clearing the
497    /// `lookup_stale` flag (lookup is trivially consistent when empty).
498    pub fn clear(&mut self) {
499        self.strings.truncate(1); // Keep INVALID slot
500        self.strings[0] = None;
501        self.lookup.clear();
502        self.ref_counts.truncate(1);
503        self.ref_counts[0] = 0;
504        self.free_list.clear();
505        self.lookup_stale = false;
506    }
507
508    /// Reserves capacity for at least `additional` more strings.
509    pub fn reserve(&mut self, additional: usize) {
510        self.strings.reserve(additional);
511        self.ref_counts.reserve(additional);
512        self.lookup.reserve(additional);
513    }
514
515    /// Pre-allocates `count` string slots for bulk parallel commit.
516    ///
517    /// The new slots are initialized with `None` (no string) and `ref_count = 0`.
518    /// Returns the start index of the allocated range. The caller can then
519    /// fill slots `start..start+count` via [`StringInterner::bulk_slices_mut`].
520    ///
521    /// This method does **not** touch the `free_list` — it always appends to the
522    /// end of the `strings` and `ref_counts` vectors. This is intentional:
523    /// during parallel commit, each file gets a contiguous, non-overlapping range.
524    ///
525    /// # Errors
526    ///
527    /// Returns `InternError::CapacityExhausted` if the allocation would exceed
528    /// the `LOCAL_TAG_BIT` boundary (2^31 indices reserved for global IDs).
529    ///
530    /// # Arguments
531    ///
532    /// * `count` - Number of slots to pre-allocate. If 0, this is a no-op
533    ///   returning the current length.
534    pub fn alloc_range(&mut self, count: u32) -> Result<u32, InternError> {
535        let start = self.strings.len();
536        let start_u32 = u32::try_from(start).map_err(|_| InternError::CapacityExhausted)?;
537
538        if count == 0 {
539            return Ok(start_u32);
540        }
541
542        // Check that the last index (start + count - 1) does not hit LOCAL_TAG_BIT
543        let end_u32 = start_u32
544            .checked_add(count)
545            .ok_or(InternError::CapacityExhausted)?;
546        // end_u32 is one-past-last, so the last valid index is end_u32 - 1
547        if (end_u32 - 1) & StringId::LOCAL_TAG_BIT != 0 {
548            return Err(InternError::CapacityExhausted);
549        }
550
551        // Extend with None slots and zero ref_counts
552        self.strings.resize(end_u32 as usize, None);
553        self.ref_counts.resize(end_u32 as usize, 0);
554
555        // Mark lookup as stale — bulk slots bypass the lookup HashMap.
556        // Callers must call build_dedup_table() before using lookup-dependent
557        // methods (intern, get, contains, len, is_empty).
558        self.lookup_stale = true;
559
560        Ok(start_u32)
561    }
562
563    /// Returns mutable sub-slices into the `strings` and `ref_counts` arrays for
564    /// the range `start..start+count`.
565    ///
566    /// This enables parallel file commit workers to write directly into their
567    /// pre-allocated range without contention. The caller is responsible for
568    /// ensuring no overlapping ranges are accessed concurrently.
569    ///
570    /// Defensively marks the lookup as stale when `count > 0`, since the
571    /// returned slices allow direct mutation of string slots without updating
572    /// the `lookup` `HashMap`.
573    ///
574    /// # Panics
575    ///
576    /// Panics if `start + count` exceeds the current vector length.
577    pub fn bulk_slices_mut(
578        &mut self,
579        start: u32,
580        count: u32,
581    ) -> (&mut [Option<Arc<str>>], &mut [u32]) {
582        if count > 0 {
583            self.lookup_stale = true;
584        }
585        let s = start as usize;
586        let e = s + count as usize;
587        let strings_slice = &mut self.strings[s..e];
588        let ref_counts_slice = &mut self.ref_counts[s..e];
589        (strings_slice, ref_counts_slice)
590    }
591
    /// Scans all string slots and deduplicates identical strings.
    ///
    /// After parallel commit, multiple file workers may have inserted the same
    /// string into different slots. This method:
    ///
    /// 1. Iterates slots `1..N` in index order (deterministic).
    /// 2. For the first occurrence of each string value, that slot becomes the
    ///    **canonical** entry.
    /// 3. For duplicate occurrences, their `ref_count` is accumulated into the
    ///    canonical slot, and the duplicate slot is cleared (`None`, `ref_count = 0`).
    /// 4. The `lookup` `HashMap` is rebuilt from canonical entries only.
    ///
    /// Returns a remap table mapping duplicate `StringId` to canonical `StringId`.
    /// Canonical entries are **not** included in the returned map.
    ///
    /// Also clears `lookup_stale`: after the rebuild, `lookup` is consistent
    /// with the slot contents again.
    pub fn build_dedup_table(&mut self) -> HashMap<StringId, StringId> {
        let mut remap: HashMap<StringId, StringId> = HashMap::new();
        // Track first-seen: string value -> canonical index.
        // Index-order iteration makes both the canonical choice and the
        // free_list push order deterministic.
        let mut canonical: HashMap<Arc<str>, u32> = HashMap::new();

        // Start at 1: slot 0 is the INVALID sentinel.
        for idx in 1..self.strings.len() {
            let Some(ref arc_str) = self.strings[idx] else {
                continue; // empty (recycled or never-filled) slot
            };

            if let Some(&canon_idx) = canonical.get(arc_str) {
                // Duplicate: accumulate ref_count into canonical
                let dup_rc = self.ref_counts[idx];
                self.ref_counts[canon_idx as usize] =
                    self.ref_counts[canon_idx as usize].saturating_add(dup_rc);
                // Clear duplicate slot and add to free_list for reuse
                self.strings[idx] = None;
                self.ref_counts[idx] = 0;
                let idx_u32 = u32::try_from(idx)
                    .unwrap_or_else(|_| unreachable!("string slot index exceeds u32 invariant"));
                self.free_list.push(idx_u32);
                // Record remap (duplicate -> canonical)
                remap.insert(StringId::new(idx_u32), StringId::new(canon_idx));
            } else {
                // First occurrence — this is canonical
                let idx_u32 = u32::try_from(idx)
                    .unwrap_or_else(|_| unreachable!("string slot index exceeds u32 invariant"));
                canonical.insert(Arc::clone(arc_str), idx_u32);
            }
        }

        // Rebuild lookup from canonical entries only
        self.lookup.clear();
        self.lookup.reserve(canonical.len());
        for (arc_str, idx) in canonical {
            self.lookup.insert(arc_str, idx);
        }

        // Lookup is now consistent with slot contents — clear stale flag.
        self.lookup_stale = false;

        remap
    }
649
650    /// Truncates the `strings` and `ref_counts` vectors to `saved_len`.
651    ///
652    /// This rolls back a failed bulk allocation by removing all slots at
653    /// index `saved_len` and beyond. The `lookup` `HashMap` is **not** modified
654    /// (the caller is responsible for ensuring no lookup entries point to the
655    /// truncated region).
656    ///
657    /// # Panics
658    ///
659    /// Panics if `saved_len` is 0 (would remove the sentinel slot).
660    pub fn truncate_to(&mut self, saved_len: usize) {
661        assert!(saved_len >= 1, "cannot truncate sentinel slot at index 0");
662        self.strings.truncate(saved_len);
663        self.ref_counts.truncate(saved_len);
664        // Rolling back to pre-allocation state restores lookup consistency:
665        // the lookup only contains entries for slots that existed before
666        // the bulk allocation, all of which are still valid after truncation.
667        self.lookup_stale = false;
668    }
669
    /// Returns the total number of string slots including the sentinel at index 0.
    ///
    /// This is the raw vector length, not the number of interned strings —
    /// empty (recycled or pre-allocated) slots are included in the count.
    /// Useful for saving/restoring allocation state around `alloc_range` /
    /// `truncate_to`.
    #[inline]
    #[must_use]
    pub fn string_count_raw(&self) -> usize {
        self.strings.len()
    }
679
    /// Returns whether the lookup `HashMap` is stale (bulk slots written
    /// without a `build_dedup_table()` rebuild).
    ///
    /// While stale, lookup-dependent methods (`intern`, `get`, `contains`,
    /// `len`, `is_empty`, `recycle_unreferenced`) will panic.
    /// This is primarily useful for testing and diagnostics.
    #[inline]
    #[must_use]
    pub fn is_lookup_stale(&self) -> bool {
        self.lookup_stale
    }
689
690    /// Returns statistics about the interner.
691    ///
692    /// Safe to call even when lookup is stale — uses slot-based counting
693    /// instead of lookup length.
694    #[must_use]
695    pub fn stats(&self) -> InternerStats {
696        let total_bytes: usize = self
697            .strings
698            .iter()
699            .filter_map(|opt| opt.as_ref())
700            .map(|s| s.len())
701            .sum();
702
703        // Count occupied slots directly — safe regardless of lookup state.
704        let string_count = self
705            .strings
706            .iter()
707            .skip(1) // skip sentinel
708            .filter(|s| s.is_some())
709            .count();
710
711        InternerStats {
712            string_count,
713            total_bytes,
714            free_slots: self.free_list.len(),
715            capacity: self.strings.capacity(),
716        }
717    }
718}
719
720impl Default for StringInterner {
721    fn default() -> Self {
722        Self::new()
723    }
724}
725
726impl fmt::Display for StringInterner {
727    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
728        // Use slot-based count (safe regardless of lookup state).
729        let count = self.stats().string_count;
730        write!(
731            f,
732            "StringInterner(strings={count}, free={}{})",
733            self.free_list.len(),
734            if self.lookup_stale {
735                ", lookup_stale"
736            } else {
737                ""
738            }
739        )
740    }
741}
742
/// Statistics about a `StringInterner`, as produced by `StringInterner::stats()`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct InternerStats {
    /// Number of interned strings (occupied slots, excluding the sentinel).
    pub string_count: usize,
    /// Total bytes of string data across all occupied slots.
    pub total_bytes: usize,
    /// Number of free (recyclable) slots on the free list.
    pub free_slots: usize,
    /// Allocated capacity of the slot vector.
    pub capacity: usize,
}
755
756#[cfg(test)]
757mod tests {
758    use super::*;
759
760    #[test]
761    fn test_new() {
762        let interner = StringInterner::new();
763        assert_eq!(interner.len(), 0);
764        assert!(interner.is_empty());
765    }
766
767    #[test]
768    fn test_with_capacity() {
769        let interner = StringInterner::with_capacity(100);
770        assert_eq!(interner.len(), 0);
771        assert!(interner.strings.capacity() >= 101); // +1 for INVALID slot
772    }
773
774    #[test]
775    fn test_intern_single() {
776        let mut interner = StringInterner::new();
777        let id = interner.intern("hello").unwrap();
778
779        assert!(!id.is_invalid());
780        assert_eq!(interner.len(), 1);
781
782        let resolved = interner.resolve(id).unwrap();
783        assert_eq!(&*resolved, "hello");
784    }
785
786    #[test]
787    fn test_intern_duplicate() {
788        let mut interner = StringInterner::new();
789        let id1 = interner.intern("foo").unwrap();
790        let id2 = interner.intern("foo").unwrap();
791
792        assert_eq!(id1, id2);
793        assert_eq!(interner.len(), 1);
794        assert_eq!(interner.ref_count(id1), 2);
795    }
796
797    #[test]
798    fn test_intern_different() {
799        let mut interner = StringInterner::new();
800        let id1 = interner.intern("foo").unwrap();
801        let id2 = interner.intern("bar").unwrap();
802
803        assert_ne!(id1, id2);
804        assert_eq!(interner.len(), 2);
805    }
806
    /// `resolve` on the sentinel INVALID id must yield `None`, never panic.
    #[test]
    fn test_resolve_invalid() {
        let interner = StringInterner::new();
        assert!(interner.resolve(StringId::INVALID).is_none());
    }

    /// `resolve` on an id past the end of the slot vector must yield `None`.
    #[test]
    fn test_resolve_out_of_bounds() {
        let interner = StringInterner::new();
        assert!(interner.resolve(StringId::new(999)).is_none());
    }

    /// Full ref-count lifecycle: intern (+1), re-intern (+1), dec (-1),
    /// inc (+1).
    #[test]
    fn test_ref_count() {
        let mut interner = StringInterner::new();
        let id = interner.intern("test").unwrap();

        assert_eq!(interner.ref_count(id), 1);

        interner.intern("test").unwrap(); // Same string
        assert_eq!(interner.ref_count(id), 2);

        interner.dec_ref(id);
        assert_eq!(interner.ref_count(id), 1);

        interner.inc_ref(id);
        assert_eq!(interner.ref_count(id), 2);
    }

    /// `inc_ref` on INVALID must report failure via `None`.
    #[test]
    fn test_inc_ref_invalid() {
        let mut interner = StringInterner::new();
        assert!(interner.inc_ref(StringId::INVALID).is_none());
    }

    /// `dec_ref` on INVALID must report failure via `None`.
    #[test]
    fn test_dec_ref_invalid() {
        let mut interner = StringInterner::new();
        assert!(interner.dec_ref(StringId::INVALID).is_none());
    }

    /// Decrementing a zero ref count must saturate, not underflow.
    #[test]
    fn test_dec_ref_saturating() {
        let mut interner = StringInterner::new();
        let id = interner.intern_without_ref("test").unwrap();

        assert_eq!(interner.ref_count(id), 0);
        interner.dec_ref(id);
        assert_eq!(interner.ref_count(id), 0); // Saturates at 0
    }
857
    /// `intern_without_ref` stores the string but leaves ref_count at 0,
    /// making the slot eligible for recycling.
    #[test]
    fn test_intern_without_ref() {
        let mut interner = StringInterner::new();
        let id = interner.intern_without_ref("test").unwrap();

        assert_eq!(interner.ref_count(id), 0);
        assert_eq!(interner.resolve(id).unwrap().as_ref(), "test");
    }

    /// `recycle_unreferenced` frees only zero-ref slots and leaves
    /// referenced entries resolvable.
    #[test]
    fn test_recycle_unreferenced() {
        let mut interner = StringInterner::new();
        let id1 = interner.intern_without_ref("foo").unwrap();
        let id2 = interner.intern("bar").unwrap();

        assert_eq!(interner.len(), 2);

        let recycled = interner.recycle_unreferenced();
        assert_eq!(recycled, 1); // Only "foo" (ref_count = 0)
        assert_eq!(interner.len(), 1);

        // id1 should now be invalid
        assert!(interner.resolve(id1).is_none());
        // id2 should still work
        assert_eq!(interner.resolve(id2).unwrap().as_ref(), "bar");
    }

    /// A recycled slot must be handed out again by the next `intern`
    /// instead of growing the slot vector.
    #[test]
    fn test_free_list_reuse() {
        let mut interner = StringInterner::new();
        let id1 = interner.intern_without_ref("foo").unwrap();
        let _id2 = interner.intern("bar").unwrap();

        // Recycle "foo"
        interner.recycle_unreferenced();

        // New string should reuse the slot
        let id3 = interner.intern("baz").unwrap();
        assert_eq!(id3.index(), id1.index()); // Same slot reused
    }
898
899    #[test]
900    fn test_contains() {
901        let mut interner = StringInterner::new();
902        interner.intern("hello").unwrap();
903
904        assert!(interner.contains("hello"));
905        assert!(!interner.contains("world"));
906    }
907
908    #[test]
909    fn test_get() {
910        let mut interner = StringInterner::new();
911        let id = interner.intern("hello").unwrap();
912
913        assert_eq!(interner.get("hello"), Some(id));
914        assert_eq!(interner.get("world"), None);
915    }
916
917    #[test]
918    fn test_iter() {
919        let mut interner = StringInterner::new();
920        interner.intern("foo").unwrap();
921        interner.intern("bar").unwrap();
922        interner.intern("baz").unwrap();
923
924        let strings: Vec<_> = interner.iter().map(|(_, s)| s.as_ref()).collect();
925        assert_eq!(strings.len(), 3);
926        assert!(strings.contains(&"foo"));
927        assert!(strings.contains(&"bar"));
928        assert!(strings.contains(&"baz"));
929    }
930
    /// `clear` empties the interner back to the pristine state.
    #[test]
    fn test_clear() {
        let mut interner = StringInterner::new();
        interner.intern("foo").unwrap();
        interner.intern("bar").unwrap();

        assert_eq!(interner.len(), 2);
        interner.clear();
        assert_eq!(interner.len(), 0);
        assert!(interner.is_empty());
    }

    /// `reserve(n)` must leave room for n additional strings
    /// (capacity >= 1001 because the sentinel slot already occupies one).
    #[test]
    fn test_reserve() {
        let mut interner = StringInterner::new();
        interner.reserve(1000);
        assert!(interner.strings.capacity() >= 1001);
    }

    /// `Display` output names the type and reports the string count.
    #[test]
    fn test_display() {
        let mut interner = StringInterner::new();
        interner.intern("test").unwrap();

        let display = format!("{interner}");
        assert!(display.contains("StringInterner"));
        assert!(display.contains("strings=1"));
    }

    /// `stats` aggregates string count, total byte length, and free slots.
    #[test]
    fn test_stats() {
        let mut interner = StringInterner::new();
        interner.intern("hello").unwrap(); // 5 bytes
        interner.intern("world").unwrap(); // 5 bytes

        let stats = interner.stats();
        assert_eq!(stats.string_count, 2);
        assert_eq!(stats.total_bytes, 10);
        assert_eq!(stats.free_slots, 0);
    }

    /// The empty string is a valid internable value with a valid id.
    #[test]
    fn test_empty_string() {
        let mut interner = StringInterner::new();
        let id = interner.intern("").unwrap();

        assert!(!id.is_invalid());
        assert_eq!(interner.resolve(id).unwrap().as_ref(), "");
    }

    /// Multi-byte UTF-8 strings round-trip through intern/resolve intact.
    #[test]
    fn test_unicode() {
        let mut interner = StringInterner::new();
        let id = interner.intern("日本語").unwrap();

        let resolved = interner.resolve(id).unwrap();
        assert_eq!(&*resolved, "日本語");
    }
989
990    #[test]
991    fn test_default() {
992        let interner: StringInterner = StringInterner::default();
993        assert_eq!(interner.len(), 0);
994    }
995
    /// `ResolveError`'s Display must mention the offending `StringId`.
    #[test]
    fn test_resolve_error_display() {
        let err = ResolveError {
            id: StringId::new(42),
        };
        let display = format!("{err}");
        assert!(display.contains("StringId"));
    }

    /// `InternError::CapacityExhausted`'s Display names the failure mode.
    #[test]
    fn test_intern_error_display() {
        let err = InternError::CapacityExhausted;
        let display = format!("{err}");
        assert!(display.contains("capacity exhausted"));
    }
1011
1012    // ── Bulk API tests for parallel commit ──────────────────────────────
1013
    /// `alloc_range` appends empty (None, ref_count 0) slots after the
    /// sentinel and returns the index of the first new slot.
    #[test]
    fn test_alloc_range_basic() {
        let mut interner = StringInterner::new();
        // Sentinel exists at index 0
        assert_eq!(interner.string_count_raw(), 1);

        let start = interner.alloc_range(5).unwrap();
        // Start must be >= 1 (sentinel at 0 is already there)
        assert!(start >= 1);
        assert_eq!(start, 1); // New interner, first alloc starts right after sentinel
        assert_eq!(interner.string_count_raw(), 6); // 1 sentinel + 5 allocated

        // All new slots should be None with ref_count 0
        for i in start..start + 5 {
            let id = StringId::new(i);
            assert!(interner.resolve(id).is_none());
            assert_eq!(interner.ref_count(id), 0);
        }
    }

    /// `alloc_range(0)` is a no-op: no growth, start equals current length.
    #[test]
    fn test_alloc_range_zero_noop() {
        let mut interner = StringInterner::new();
        let before = interner.string_count_raw();
        let start = interner.alloc_range(0).unwrap();
        assert_eq!(start as usize, before);
        assert_eq!(interner.string_count_raw(), before);
    }

    /// Ranges are appended after existing entries without disturbing them.
    #[test]
    fn test_alloc_range_after_existing_strings() {
        let mut interner = StringInterner::new();
        interner.intern("existing").unwrap();
        // strings = [None, Some("existing")] → len = 2
        let start = interner.alloc_range(3).unwrap();
        assert_eq!(start, 2);
        assert_eq!(interner.string_count_raw(), 5);
        // Existing string still works
        assert_eq!(
            interner.resolve(StringId::new(1)).unwrap().as_ref(),
            "existing"
        );
    }

    /// Writing through `bulk_slices_mut` populates slots that `resolve`
    /// and `ref_count` then observe directly (no lookup involved).
    #[test]
    fn test_bulk_slices_mut() {
        let mut interner = StringInterner::new();
        let start = interner.alloc_range(3).unwrap();

        // Write strings into the pre-allocated slots
        {
            let (strings, ref_counts) = interner.bulk_slices_mut(start, 3);
            strings[0] = Some(Arc::from("alpha"));
            strings[1] = Some(Arc::from("beta"));
            strings[2] = Some(Arc::from("gamma"));
            ref_counts[0] = 1;
            ref_counts[1] = 2;
            ref_counts[2] = 1;
        }

        // Verify via resolve()
        assert_eq!(
            interner.resolve(StringId::new(start)).unwrap().as_ref(),
            "alpha"
        );
        assert_eq!(
            interner.resolve(StringId::new(start + 1)).unwrap().as_ref(),
            "beta"
        );
        assert_eq!(
            interner.resolve(StringId::new(start + 2)).unwrap().as_ref(),
            "gamma"
        );

        // Verify ref_counts
        assert_eq!(interner.ref_count(StringId::new(start)), 1);
        assert_eq!(interner.ref_count(StringId::new(start + 1)), 2);
        assert_eq!(interner.ref_count(StringId::new(start + 2)), 1);
    }
1093
    /// With no duplicate slots, `build_dedup_table` returns an empty remap
    /// and simply rebuilds the lookup.
    #[test]
    fn test_build_dedup_table_no_duplicates() {
        let mut interner = StringInterner::new();
        let start = interner.alloc_range(3).unwrap();

        {
            let (strings, ref_counts) = interner.bulk_slices_mut(start, 3);
            strings[0] = Some(Arc::from("one"));
            strings[1] = Some(Arc::from("two"));
            strings[2] = Some(Arc::from("three"));
            ref_counts[0] = 1;
            ref_counts[1] = 1;
            ref_counts[2] = 1;
        }

        let remap = interner.build_dedup_table();
        assert!(remap.is_empty(), "no duplicates means empty remap");

        // All strings still resolvable
        assert_eq!(
            interner.resolve(StringId::new(start)).unwrap().as_ref(),
            "one"
        );
        assert_eq!(
            interner.resolve(StringId::new(start + 1)).unwrap().as_ref(),
            "two"
        );
        assert_eq!(
            interner.resolve(StringId::new(start + 2)).unwrap().as_ref(),
            "three"
        );

        // Lookup rebuilt correctly
        assert_eq!(interner.len(), 3);
    }

    /// Duplicates map to the lowest-index canonical slot; canonical
    /// ref_counts absorb the duplicates' counts and duplicate slots are
    /// cleared.
    #[test]
    fn test_build_dedup_table_with_duplicates() {
        let mut interner = StringInterner::new();
        let start = interner.alloc_range(4).unwrap();

        {
            let (strings, ref_counts) = interner.bulk_slices_mut(start, 4);
            strings[0] = Some(Arc::from("hello"));
            strings[1] = Some(Arc::from("world"));
            strings[2] = Some(Arc::from("hello")); // duplicate of slot 0
            strings[3] = Some(Arc::from("world")); // duplicate of slot 1
            ref_counts[0] = 1;
            ref_counts[1] = 3;
            ref_counts[2] = 2;
            ref_counts[3] = 1;
        }

        let remap = interner.build_dedup_table();

        // Two duplicates should be remapped
        assert_eq!(remap.len(), 2);
        assert_eq!(remap[&StringId::new(start + 2)], StringId::new(start)); // hello dup → hello canon
        assert_eq!(remap[&StringId::new(start + 3)], StringId::new(start + 1)); // world dup → world canon

        // Canonical ref_counts accumulated
        assert_eq!(interner.ref_count(StringId::new(start)), 3); // 1 + 2
        assert_eq!(interner.ref_count(StringId::new(start + 1)), 4); // 3 + 1

        // Duplicate slots cleared
        assert!(interner.resolve(StringId::new(start + 2)).is_none());
        assert!(interner.resolve(StringId::new(start + 3)).is_none());
        assert_eq!(interner.ref_count(StringId::new(start + 2)), 0);
        assert_eq!(interner.ref_count(StringId::new(start + 3)), 0);

        // Lookup has only unique strings
        assert_eq!(interner.len(), 2);
        assert_eq!(interner.get("hello"), Some(StringId::new(start)));
        assert_eq!(interner.get("world"), Some(StringId::new(start + 1)));
    }
1169
    /// `truncate_to` rolls the slot vector back to a saved length,
    /// discarding a pre-allocated range while keeping earlier strings.
    #[test]
    fn test_truncate_to() {
        let mut interner = StringInterner::new();
        interner.intern("keep").unwrap();

        let saved = interner.string_count_raw(); // 2 (sentinel + "keep")
        let _start = interner.alloc_range(10).unwrap();
        assert_eq!(interner.string_count_raw(), 12);

        interner.truncate_to(saved);
        assert_eq!(interner.string_count_raw(), 2);

        // Original string still works
        assert_eq!(interner.resolve(StringId::new(1)).unwrap().as_ref(), "keep");
    }

    /// Truncating below the sentinel slot is an invariant violation.
    #[test]
    #[should_panic(expected = "cannot truncate sentinel")]
    fn test_truncate_to_zero_panics() {
        let mut interner = StringInterner::new();
        interner.truncate_to(0);
    }

    /// Deduplication must never touch the sentinel slot at index 0.
    #[test]
    fn test_dedup_preserves_sentinel() {
        let mut interner = StringInterner::new();
        let start = interner.alloc_range(2).unwrap();

        {
            let (strings, ref_counts) = interner.bulk_slices_mut(start, 2);
            strings[0] = Some(Arc::from("a"));
            strings[1] = Some(Arc::from("a")); // duplicate
            ref_counts[0] = 1;
            ref_counts[1] = 1;
        }

        let _remap = interner.build_dedup_table();

        // Sentinel at index 0 must still be None with ref_count 0
        assert!(interner.resolve(StringId::new(0)).is_none());
        // StringId(0) is actually not INVALID (INVALID = u32::MAX), but
        // resolve checks the slot content which is None
        assert_eq!(interner.ref_counts[0], 0);
        assert!(interner.strings[0].is_none());
    }

    /// `string_count_raw` counts all slots, including the sentinel and
    /// empty pre-allocated ones.
    #[test]
    fn test_string_count_raw() {
        let mut interner = StringInterner::new();
        assert_eq!(interner.string_count_raw(), 1); // sentinel only

        interner.intern("a").unwrap();
        assert_eq!(interner.string_count_raw(), 2);

        interner.alloc_range(5).unwrap();
        assert_eq!(interner.string_count_raw(), 7);
    }

    /// `ref_count` never panics: invalid and out-of-bounds ids report 0.
    #[test]
    fn test_ref_count_accessor() {
        let mut interner = StringInterner::new();
        let id = interner.intern("test").unwrap();
        assert_eq!(interner.ref_count(id), 1);

        interner.intern("test").unwrap(); // increment
        assert_eq!(interner.ref_count(id), 2);

        // Invalid ID returns 0
        assert_eq!(interner.ref_count(StringId::INVALID), 0);

        // Out of bounds returns 0
        assert_eq!(interner.ref_count(StringId::new(999)), 0);
    }

    #[test]
    fn test_alloc_range_basic_succeeds() {
        // Verify that alloc_range returns the correct starting index and
        // advances the string count by the requested amount.  The capacity
        // boundary (LOCAL_TAG_BIT) cannot be exercised in a unit test because
        // allocating 2^31 slots would be prohibitive; that path is covered by
        // the error handling in alloc_range itself.
        let mut interner = StringInterner::with_max_ids(u32::MAX);
        let start = interner.alloc_range(10).unwrap();
        assert_eq!(start, 1, "first allocation should start at index 1");
        assert_eq!(
            interner.string_count_raw(),
            11,
            "string count should advance by the allocated range size"
        );
    }
1260
    #[test]
    fn test_dedup_frees_slots_for_reuse() {
        // Regression test: after dedup, duplicate slots must be on free_list
        // so that future intern() calls reuse them instead of growing forever.
        let mut interner = StringInterner::new();
        let start = interner.alloc_range(3).unwrap();

        {
            let (strings, ref_counts) = interner.bulk_slices_mut(start, 3);
            strings[0] = Some(Arc::from("dup"));
            strings[1] = Some(Arc::from("unique"));
            strings[2] = Some(Arc::from("dup")); // duplicate of slot 0
            ref_counts[0] = 1;
            ref_counts[1] = 1;
            ref_counts[2] = 1;
        }

        let remap = interner.build_dedup_table();
        assert_eq!(remap.len(), 1);

        // Slot start+2 was a duplicate and should now be on the free_list
        let dup_slot = start + 2;
        assert!(interner.resolve(StringId::new(dup_slot)).is_none());

        // Intern a new string — it should reuse the freed duplicate slot
        let new_id = interner.intern("fresh").unwrap();
        assert_eq!(
            new_id.index(),
            dup_slot,
            "new intern should reuse freed duplicate slot"
        );
    }

    /// Dedup must skip `None` gaps (e.g. recycled slots) without treating
    /// them as strings or disturbing their emptiness.
    #[test]
    fn test_build_dedup_table_with_gaps() {
        // Test dedup when there are None gaps (e.g., from recycled slots)
        let mut interner = StringInterner::new();
        let start = interner.alloc_range(4).unwrap();

        {
            let (strings, ref_counts) = interner.bulk_slices_mut(start, 4);
            strings[0] = Some(Arc::from("x"));
            // strings[1] intentionally left as None (gap)
            strings[2] = Some(Arc::from("y"));
            strings[3] = Some(Arc::from("x")); // duplicate of slot 0
            ref_counts[0] = 1;
            ref_counts[2] = 1;
            ref_counts[3] = 1;
        }

        let remap = interner.build_dedup_table();

        // Only slot 3 (duplicate "x") should be remapped
        assert_eq!(remap.len(), 1);
        assert_eq!(remap[&StringId::new(start + 3)], StringId::new(start));

        // Canonical "x" ref_count accumulated
        assert_eq!(interner.ref_count(StringId::new(start)), 2);

        // "y" untouched
        assert_eq!(interner.ref_count(StringId::new(start + 2)), 1);

        // Gap at slot 1 still None
        assert!(interner.resolve(StringId::new(start + 1)).is_none());
    }
1326
1327    // ── Deterministic serialization tests ────────────────────────────────
1328
    #[test]
    fn test_interner_deterministic_serialization() {
        // Two interners with the same insertion order must produce identical
        // postcard bytes. Without sorted lookup serialization, HashMap's
        // random hash seeds could cause different byte output.
        let mut interner1 = StringInterner::new();
        interner1.intern("alpha").unwrap();
        interner1.intern("beta").unwrap();
        interner1.intern("gamma").unwrap();

        let mut interner2 = StringInterner::new();
        interner2.intern("alpha").unwrap();
        interner2.intern("beta").unwrap();
        interner2.intern("gamma").unwrap();

        let bytes1 = postcard::to_stdvec(&interner1).expect("serialize interner1");
        let bytes2 = postcard::to_stdvec(&interner2).expect("serialize interner2");

        assert_eq!(
            bytes1, bytes2,
            "same insertion order must produce identical postcard bytes"
        );
    }

    /// Serialize → deserialize must preserve both the string set and the
    /// per-string reference counts.
    #[test]
    fn test_interner_serialization_roundtrip() {
        let mut interner = StringInterner::new();
        interner.intern("foo").unwrap();
        interner.intern("bar").unwrap();
        interner.intern("baz").unwrap();
        // Intern "foo" again to bump its ref count
        interner.intern("foo").unwrap();

        let bytes = postcard::to_stdvec(&interner).expect("serialize");
        let deserialized: StringInterner = postcard::from_bytes(&bytes).expect("deserialize");

        // Verify all strings survived the roundtrip
        assert_eq!(deserialized.len(), 3);
        assert!(deserialized.contains("foo"));
        assert!(deserialized.contains("bar"));
        assert!(deserialized.contains("baz"));

        // Verify ref counts survived
        let foo_id = deserialized.get("foo").unwrap();
        assert_eq!(deserialized.ref_count(foo_id), 2);

        let bar_id = deserialized.get("bar").unwrap();
        assert_eq!(deserialized.ref_count(bar_id), 1);
    }
1378
1379    #[test]
1380    fn test_interner_sorted_lookup_produces_stable_bytes() {
1381        // Verify that the lookup map portion is sorted by inspecting the
1382        // serialized bytes. Since postcard serializes sequences in order,
1383        // "alpha" must appear before "beta" which must appear before "gamma"
1384        // in the output.
1385        let mut interner = StringInterner::new();
1386        interner.intern("gamma").unwrap();
1387        interner.intern("alpha").unwrap();
1388        interner.intern("beta").unwrap();
1389
1390        let bytes = postcard::to_stdvec(&interner).expect("serialize");
1391
1392        // Find the positions of the key strings in the serialized bytes
1393        let bytes_str = String::from_utf8_lossy(&bytes);
1394        // "alpha" should appear before "beta" before "gamma" in the lookup
1395        // portion (the second occurrence, since strings Vec has insertion order)
1396        let find_second = |needle: &str| {
1397            let first = bytes_str.find(needle).unwrap();
1398            bytes_str[first + needle.len()..]
1399                .find(needle)
1400                .map(|pos| pos + first + needle.len())
1401        };
1402
1403        // The lookup map is serialized after the strings Vec, so the second
1404        // occurrence of each string is in the sorted lookup portion
1405        let alpha_pos = find_second("alpha");
1406        let beta_pos = find_second("beta");
1407        let gamma_pos = find_second("gamma");
1408
1409        if let (Some(a), Some(b), Some(g)) = (alpha_pos, beta_pos, gamma_pos) {
1410            assert!(a < b, "alpha should appear before beta in sorted lookup");
1411            assert!(b < g, "beta should appear before gamma in sorted lookup");
1412        }
1413        // If any string doesn't appear twice, the test is vacuously true
1414        // (postcard may optimize storage), which is fine.
1415    }
1416
    // ── lookup_stale invariant guard tests ──────────────────────────────
1418
    /// `alloc_range` with a nonzero count must flip the staleness flag.
    #[test]
    fn test_alloc_range_sets_lookup_stale() {
        let mut interner = StringInterner::new();
        assert!(!interner.is_lookup_stale());

        interner.alloc_range(5).unwrap();
        assert!(interner.is_lookup_stale());
    }

    /// A zero-sized allocation must not mark the lookup stale.
    #[test]
    fn test_alloc_range_zero_does_not_set_stale() {
        let mut interner = StringInterner::new();
        interner.alloc_range(0).unwrap();
        // Zero allocation is a no-op — lookup remains consistent.
        assert!(!interner.is_lookup_stale());
    }

    /// `build_dedup_table` rebuilds the lookup and clears the stale flag.
    #[test]
    fn test_build_dedup_table_clears_lookup_stale() {
        let mut interner = StringInterner::new();
        let start = interner.alloc_range(2).unwrap();
        assert!(interner.is_lookup_stale());

        // Write some strings into the bulk slots
        interner.strings[start as usize] = Some(Arc::from("alpha"));
        interner.ref_counts[start as usize] = 1;
        interner.strings[(start + 1) as usize] = Some(Arc::from("beta"));
        interner.ref_counts[(start + 1) as usize] = 1;

        let _remap = interner.build_dedup_table();
        assert!(!interner.is_lookup_stale());

        // Lookup should now be consistent
        assert!(interner.contains("alpha"));
        assert!(interner.contains("beta"));
        assert_eq!(interner.len(), 2);
    }

    /// `truncate_to` rolls back the allocation and clears the stale flag.
    #[test]
    fn test_truncate_to_clears_lookup_stale() {
        let mut interner = StringInterner::new();
        let saved_len = interner.string_count_raw();

        interner.alloc_range(10).unwrap();
        assert!(interner.is_lookup_stale());

        interner.truncate_to(saved_len);
        assert!(!interner.is_lookup_stale());

        // Should be usable again after rollback
        let id = interner.intern("after_rollback").unwrap();
        assert!(interner.contains("after_rollback"));
        assert_eq!(interner.resolve(id).unwrap().as_ref(), "after_rollback");
    }

    // Each lookup-dependent method must refuse to run while the lookup is
    // stale; the module doc's invariant list is exercised one method at a
    // time below.

    #[test]
    #[should_panic(expected = "lookup is stale")]
    fn test_intern_panics_when_lookup_stale() {
        let mut interner = StringInterner::new();
        interner.alloc_range(1).unwrap();
        let _ = interner.intern("should_panic");
    }

    #[test]
    #[should_panic(expected = "lookup is stale")]
    fn test_intern_without_ref_panics_when_lookup_stale() {
        let mut interner = StringInterner::new();
        interner.alloc_range(1).unwrap();
        let _ = interner.intern_without_ref("should_panic");
    }

    #[test]
    #[should_panic(expected = "lookup is stale")]
    fn test_get_panics_when_lookup_stale() {
        let mut interner = StringInterner::new();
        interner.alloc_range(1).unwrap();
        let _ = interner.get("should_panic");
    }

    #[test]
    #[should_panic(expected = "lookup is stale")]
    fn test_contains_panics_when_lookup_stale() {
        let mut interner = StringInterner::new();
        interner.alloc_range(1).unwrap();
        let _ = interner.contains("should_panic");
    }

    #[test]
    #[should_panic(expected = "lookup is stale")]
    fn test_len_panics_when_lookup_stale() {
        let mut interner = StringInterner::new();
        interner.alloc_range(1).unwrap();
        let _ = interner.len();
    }

    #[test]
    #[should_panic(expected = "lookup is stale")]
    fn test_is_empty_panics_when_lookup_stale() {
        let mut interner = StringInterner::new();
        interner.alloc_range(1).unwrap();
        let _ = interner.is_empty();
    }

    #[test]
    #[should_panic(expected = "lookup is stale")]
    fn test_recycle_unreferenced_panics_when_lookup_stale() {
        let mut interner = StringInterner::new();
        interner.alloc_range(1).unwrap();
        let _ = interner.recycle_unreferenced();
    }
1529
    #[test]
    fn test_resolve_works_when_lookup_stale() {
        // resolve() does NOT use the lookup — it reads directly from slots.
        // It should work even when the lookup is stale.
        let mut interner = StringInterner::new();
        let id = interner.intern("before_bulk").unwrap();

        interner.alloc_range(5).unwrap();
        assert!(interner.is_lookup_stale());

        // resolve() should still work for pre-existing entries
        assert_eq!(interner.resolve(id).unwrap().as_ref(), "before_bulk");
    }

    #[test]
    fn test_stats_works_when_lookup_stale() {
        // stats() should be safe to call even when lookup is stale.
        let mut interner = StringInterner::new();
        interner.intern("existing").unwrap();

        let start = interner.alloc_range(2).unwrap();
        interner.strings[start as usize] = Some(Arc::from("bulk1"));
        interner.ref_counts[start as usize] = 1;

        assert!(interner.is_lookup_stale());

        let stats = interner.stats();
        // Should count occupied slots: "existing" + "bulk1" = 2
        // (the second bulk slot is still None)
        assert_eq!(stats.string_count, 2);
    }

    /// While stale, `Display` must surface the stale state so debug output
    /// isn't misleading.
    #[test]
    fn test_display_works_when_lookup_stale() {
        let mut interner = StringInterner::new();
        interner.alloc_range(3).unwrap();
        assert!(interner.is_lookup_stale());

        let display = format!("{interner}");
        assert!(
            display.contains("lookup_stale"),
            "Display should indicate stale state: {display}"
        );
    }

    /// When consistent, `Display` must not mention staleness at all.
    #[test]
    fn test_display_omits_stale_when_consistent() {
        let mut interner = StringInterner::new();
        interner.intern("hello").unwrap();

        let display = format!("{interner}");
        assert!(
            !display.contains("lookup_stale"),
            "Display should not mention stale when consistent: {display}"
        );
    }
1586
    /// End-to-end exercise of the bulk-commit protocol across all phases.
    #[test]
    fn test_full_parallel_commit_lifecycle() {
        // Simulates the full Phase 2→3→4a lifecycle:
        // 1. Pre-allocate ranges (sets stale)
        // 2. Write strings into bulk slots (stale — only resolve works)
        // 3. build_dedup_table (clears stale, rebuilds lookup)
        // 4. All methods work again
        let mut interner = StringInterner::new();

        // Pre-existing string (before parallel commit)
        let pre_id = interner.intern("pre_existing").unwrap();
        assert_eq!(interner.len(), 1);

        // Phase 2: allocate ranges for 3 files × 2 strings each
        let start = interner.alloc_range(6).unwrap();
        assert!(interner.is_lookup_stale());

        // Phase 3: write strings (simulating parallel workers)
        // File 0: "alpha", "beta"
        interner.strings[start as usize] = Some(Arc::from("alpha"));
        interner.ref_counts[start as usize] = 1;
        interner.strings[(start + 1) as usize] = Some(Arc::from("beta"));
        interner.ref_counts[(start + 1) as usize] = 1;

        // File 1: "alpha" (duplicate), "gamma"
        interner.strings[(start + 2) as usize] = Some(Arc::from("alpha"));
        interner.ref_counts[(start + 2) as usize] = 1;
        interner.strings[(start + 3) as usize] = Some(Arc::from("gamma"));
        interner.ref_counts[(start + 3) as usize] = 1;

        // File 2: "beta" (duplicate), "delta"
        interner.strings[(start + 4) as usize] = Some(Arc::from("beta"));
        interner.ref_counts[(start + 4) as usize] = 1;
        interner.strings[(start + 5) as usize] = Some(Arc::from("delta"));
        interner.ref_counts[(start + 5) as usize] = 1;

        // resolve() works during stale state
        assert_eq!(interner.resolve(pre_id).unwrap().as_ref(), "pre_existing");

        // Phase 4a: dedup rebuilds lookup
        let remap = interner.build_dedup_table();
        assert!(!interner.is_lookup_stale());

        // 2 duplicates remapped: "alpha" at start+2, "beta" at start+4
        assert_eq!(remap.len(), 2);

        // All lookup-dependent methods work
        assert_eq!(interner.len(), 5); // pre_existing + alpha + beta + gamma + delta
        assert!(interner.contains("pre_existing"));
        assert!(interner.contains("alpha"));
        assert!(interner.contains("beta"));
        assert!(interner.contains("gamma"));
        assert!(interner.contains("delta"));
        assert_eq!(interner.get("alpha"), Some(StringId::new(start)));
        assert_eq!(interner.get("beta"), Some(StringId::new(start + 1)));

        // Canonical ref_counts should be accumulated
        assert_eq!(interner.ref_count(StringId::new(start)), 2); // alpha: 1+1
        assert_eq!(interner.ref_count(StringId::new(start + 1)), 2); // beta: 1+1
    }
1647
    #[test]
    fn test_clear_resets_lookup_stale() {
        // Regression: clear() must reset lookup_stale so that lookup-dependent
        // methods work on the now-empty (trivially consistent) interner.
        let mut interner = StringInterner::new();
        interner.alloc_range(5).unwrap();
        assert!(interner.is_lookup_stale());

        interner.clear();
        assert!(!interner.is_lookup_stale());

        // All lookup-dependent methods should work after clear
        assert_eq!(interner.len(), 0);
        assert!(interner.is_empty());
        assert!(!interner.contains("anything"));
        assert_eq!(interner.get("anything"), None);

        // Interning should work after clear
        let id = interner.intern("after_clear").unwrap();
        assert_eq!(interner.resolve(id).unwrap().as_ref(), "after_clear");
        assert_eq!(interner.len(), 1);
    }

    #[test]
    fn test_bulk_slices_mut_sets_lookup_stale() {
        // Regression: bulk_slices_mut() must defensively set lookup_stale
        // even if called independently of alloc_range().
        let mut interner = StringInterner::new();

        // Manually grow vecs to simulate pre-allocation without alloc_range
        interner.strings.resize(4, None);
        interner.ref_counts.resize(4, 0);
        assert!(!interner.is_lookup_stale());

        // bulk_slices_mut with count > 0 should set stale
        let (str_slots, rc_slots) = interner.bulk_slices_mut(1, 3);
        str_slots[0] = Some(Arc::from("x"));
        rc_slots[0] = 1;
        assert!(interner.is_lookup_stale());

        // build_dedup_table clears it
        let _remap = interner.build_dedup_table();
        assert!(!interner.is_lookup_stale());
        assert!(interner.contains("x"));
    }

    /// A zero-length `bulk_slices_mut` hands out no writable slots, so it
    /// must not mark the lookup stale.
    #[test]
    fn test_bulk_slices_mut_zero_does_not_set_stale() {
        let mut interner = StringInterner::new();
        interner.intern("existing").unwrap();
        assert!(!interner.is_lookup_stale());

        // Zero-length bulk_slices_mut is a no-op — should not set stale
        let (_s, _r) = interner.bulk_slices_mut(1, 0);
        assert!(!interner.is_lookup_stale());

        // Lookup should still work
        assert!(interner.contains("existing"));
    }
1707
1708    #[test]
1709    fn test_with_max_ids_capacity_exhaustion_via_intern() {
1710        // with_max_ids limits the number of unique interned strings.
1711        let mut interner = StringInterner::with_max_ids(2);
1712
1713        interner.intern("a").unwrap();
1714        interner.intern("b").unwrap();
1715
1716        // Third unique string should hit the limit
1717        let err = interner.intern("c").unwrap_err();
1718        assert_eq!(err, InternError::CapacityExhausted);
1719
1720        // Re-interning existing string (already in lookup) still works — no new slot
1721        let id = interner.intern("a").unwrap();
1722        assert!(interner.resolve(id).is_some());
1723    }
1724
1725    #[test]
1726    fn test_with_max_ids_capacity_exhaustion_via_intern_without_ref() {
1727        let mut interner = StringInterner::with_max_ids(1);
1728        interner.intern_without_ref("only").unwrap();
1729
1730        // Second unique string should hit the limit
1731        let err = interner.intern_without_ref("second").unwrap_err();
1732        assert_eq!(err, InternError::CapacityExhausted);
1733    }
1734
1735    #[test]
1736    fn test_intern_without_ref_already_interned_no_ref_count_change() {
1737        let mut interner = StringInterner::new();
1738        let id = interner.intern("hello").unwrap();
1739        assert_eq!(interner.ref_count(id), 1);
1740
1741        // intern_without_ref for an already-interned string returns the
1742        // existing ID but does NOT increment the ref count.
1743        let id2 = interner.intern_without_ref("hello").unwrap();
1744        assert_eq!(id, id2);
1745        assert_eq!(interner.ref_count(id), 1); // unchanged
1746    }
1747
1748    #[test]
1749    fn test_recycle_unreferenced_noop_when_all_referenced() {
1750        let mut interner = StringInterner::new();
1751        interner.intern("a").unwrap(); // ref_count = 1
1752        interner.intern("b").unwrap(); // ref_count = 1
1753
1754        let recycled = interner.recycle_unreferenced();
1755        assert_eq!(recycled, 0); // nothing to recycle
1756        assert_eq!(interner.len(), 2); // both strings still present
1757    }
1758
1759    #[test]
1760    fn test_inc_ref_out_of_bounds_returns_none() {
1761        let mut interner = StringInterner::new();
1762        // index 999 is beyond the allocated range
1763        let result = interner.inc_ref(StringId::new(999));
1764        assert!(result.is_none());
1765    }
1766
1767    #[test]
1768    fn test_dec_ref_out_of_bounds_returns_none() {
1769        let mut interner = StringInterner::new();
1770        // index 999 is beyond the allocated range
1771        let result = interner.dec_ref(StringId::new(999));
1772        assert!(result.is_none());
1773    }
1774
1775    #[test]
1776    fn test_intern_without_ref_reuses_free_list_slot() {
1777        let mut interner = StringInterner::new();
1778        // Intern and then recycle to put a slot on the free list
1779        interner.intern_without_ref("recycle_me").unwrap(); // ref_count = 0
1780        interner.intern("anchor").unwrap(); // ref_count = 1, keeps interner non-empty
1781        let recycled = interner.recycle_unreferenced();
1782        assert_eq!(recycled, 1); // "recycle_me" freed
1783
1784        // intern_without_ref for a new string should reuse the freed slot
1785        let new_id = interner.intern_without_ref("fresh").unwrap();
1786        assert_eq!(interner.resolve(new_id).unwrap().as_ref(), "fresh");
1787        assert_eq!(interner.ref_count(new_id), 0);
1788    }
1789}