qbice_storage/
intern.rs

1//! A thread-safe interning system for deduplicating values based on stable
2//! hashing.
3//!
4//! Interning is a memory optimization technique where equal values are stored
5//! only once in memory, with all references pointing to the same allocation.
6//! This module provides a concurrent interner using stable hashing to identify
7//! equal values deterministically across program executions.
8//!
9//! # Key Components
10//!
11//! - [`InternedID`]: A unique identifier combining type ID and content hash
12//! - [`Interned<T>`]: A reference-counted handle providing transparent value
13//!   access
14//! - [`Interner`]: The core deduplication engine with sharded concurrent access
15//! - [`SharedInterner`]: An `Arc`-wrapped interner for easy sharing
16//!
17//! # Benefits of Interning
18//!
19//! ## Memory Savings
20//!
21//! When you have many duplicate values, interning can dramatically reduce
22//! memory usage:
23//! - 1000 copies of "hello" → 1 allocation + 1000 small handles
24//! - Large duplicate structures → single allocation shared by all references
25//!
26//! ## Unsized Type Support
27//!
28//! The interner supports interning unsized types like `str` and `[T]` through
29//! the [`Interner::intern_unsized`] method. This is especially useful for
30//! string interning where you want to store `Interned<str>` rather than
31//! `Interned<String>`, saving the extra indirection of the `String` wrapper:
32//!
33//! ```ignore
34//! // Intern a str slice directly
35//! let s1: Interned<str> = interner.intern_unsized("hello world".to_string());
36//! let s2: Interned<str> = interner.intern_unsized("hello world".to_string());
37//!
38//! // Same allocation
39//! assert!(Arc::ptr_eq(&s1.0, &s2.0));
40//!
41//! // Can also use Box<str> or any type that converts to Arc<str>
42//! let s3: Interned<str> = interner.intern_unsized(Box::<str>::from("hello world"));
43//! ```
44//!
45//! ## Serialization Optimization
46//!
47//! The interning system integrates with the serialization framework:
48//! - First occurrence: full value serialized
49//! - Subsequent occurrences: only hash reference serialized
50//! - Automatic deduplication during deserialization
51//!
52//! ## Cross-Execution Stability
53//!
54//! Using stable hashing ensures:
55//! - Same values produce same hashes across program runs
56//! - Serialized data can reference values by hash
57//! - Deterministic behavior for testing and debugging
58//!
59//! # When to Use Interning
60//!
61//! **Use interning when:**
62//! - Many duplicate immutable values exist (strings, configs, AST nodes)
63//! - Values are compared frequently by identity
64//! - Serialization size matters (network protocols, caching)
65//! - Memory usage is more important than allocation speed
66//!
67//! **Use regular `Arc`/`Rc` when:**
68//! - Values are rarely duplicated
69//! - Cross-execution stability isn't needed
70//! - Allocation performance is critical (hashing overhead matters)
71//! - Values are mutable
72//!
73//! # Architecture
74//!
75//! The interner uses:
76//! - **Sharding**: Multiple independent hash tables to reduce lock contention
77//! - **Weak references**: Automatic cleanup when values are no longer used
78//! - **Stable hashing**: Deterministic 128-bit hashes for cross-run stability
79//! - **Type safety**: Type IDs prevent hash collisions between different types
80//!
81//! # Example
82//!
83//! ```ignore
84//! use qbice_storage::intern::Interner;
85//! use qbice_stable_hash::BuildStableHasher128;
86//!
87//! // Create an interner
88//! let interner = Interner::new(16, BuildStableHasher128::default());
89//!
90//! // Intern some values
91//! let s1 = interner.intern("hello world".to_string());
92//! let s2 = interner.intern("hello world".to_string());
93//!
94//! // Same allocation: s1 and s2 point to the exact same memory
95//! assert!(Arc::ptr_eq(&s1.0, &s2.0));
96//!
97//! // Transparent access via Deref
98//! assert_eq!(&*s1, "hello world");
99//!
100//! // When all Interned handles are dropped, the value is freed
101//! drop(s1);
102//! drop(s2);
103//! // Memory is now reclaimed
104//! ```
105//!
106//! # Serialization Integration
107//!
108//! ```ignore
109//! use qbice_serialize::Plugin;
110//!
111//! // Add interner to serialization plugin
112//! let mut plugin = Plugin::new();
113//! plugin.insert(interner.clone());
114//!
115//! // Encode data with automatic deduplication
116//! let data = vec![s1.clone(), s1.clone(), s2.clone()];
117//! // s1's value is serialized once, then referenced by hash
118//! encoder.encode(&data, &plugin, &mut session)?;
119//! ```
120
121use std::{
122    any::Any,
123    borrow::Borrow,
124    collections::{HashMap, hash_map::Entry},
125    hash::Hash,
126    ops::Deref,
127    path::Path,
128    sync::{Arc, Weak},
129    thread::JoinHandle,
130    time::Duration,
131};
132
133use crossbeam_channel::{Receiver, Sender, unbounded};
134use fxhash::{FxBuildHasher, FxHashSet};
135use parking_lot::{MappedRwLockReadGuard, RwLockReadGuard};
136use qbice_serialize::{
137    Decode, Decoder, Encode, Encoder, Plugin,
138    session::{Session, SessionKey},
139};
140use qbice_stable_hash::{
141    BuildStableHasher, Compact128, StableHash, StableHasher,
142};
143use qbice_stable_type_id::{Identifiable, StableTypeID};
144
145use crate::sharded::Sharded;
146
147/// A globally unique identifier for an interned value.
148///
149/// `InternedID` combines two components to create a collision-resistant
150/// identifier:
151/// 1. **Type ID** ([`StableTypeID`]): Uniquely identifies the Rust type
152/// 2. **Content Hash** ([`Compact128`]): 128-bit hash of the value's content
153///
154/// This dual-component design prevents type confusion: values of different
155/// types with the same content hash will still have different `InternedID`s.
156///
157/// # Structure
158///
159/// ```text
160/// InternedID {
161///     stable_type_id: 0x1234_5678_9abc_def0,  // Type's stable ID
162///     hash_128: 0xfedC_BA98_7654_3210_...,    // 128-bit content hash
163/// }
164/// ```
165///
166/// # Example Scenario
167///
168/// ```ignore
169/// // These would have the same content hash but different type IDs
170/// let string_42 = interner.intern("42".to_string());
171/// let int_42 = interner.intern(42_i32);
172///
173/// // Different InternedIDs prevent type confusion:
174/// // - string_42's ID: (String type ID, hash("42"))
175/// // - int_42's ID: (i32 type ID, hash(42))
176/// ```
177///
178/// # Hash Collisions
179///
180/// While 128-bit hashes make collisions astronomically unlikely, the
181/// combination with type IDs provides an additional layer of protection.
182/// Different types cannot collide even if their content hashes match.
183///
184/// # Serialization
185///
186/// When serializing [`Interned<T>`], subsequent occurrences of the same value
187/// are encoded as references using this ID, significantly reducing serialized
188/// size for duplicate data.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct InternedID {
    /// Stable identifier of the Rust type the value was interned as; keeps
    /// equal content hashes of *different* types from colliding.
    stable_type_id: StableTypeID,
    /// 128-bit stable hash of the value's content.
    hash_128: Compact128,
}
194
195/// A reference-counted handle to an interned value with transparent access.
196///
197/// `Interned<T>` wraps an `Arc<T>` and provides seamless access to the
198/// underlying value through the [`Deref`] trait. When multiple `Interned<T>`
199/// instances contain equal values (as determined by stable hash), they share
200/// the same underlying allocation when created through the same [`Interner`].
201///
202/// # Unsized Type Support
203///
204/// `Interned<T>` supports unsized types like `str` and `[T]`. Use
205/// [`Interner::intern_unsized`] to create `Interned<str>` or `Interned<[T]>`
206/// values. This avoids the extra layer of indirection from wrapper types like
207/// `String` or `Vec<T>`.
208///
209/// # Memory Management
210///
211/// - **Automatic deallocation**: The value is freed when all `Interned<T>`
212///   handles are dropped
213/// - **Weak references**: The interner only holds weak references, allowing
214///   garbage collection of unused values
215/// - **No memory leaks**: Values without active handles are automatically
216///   reclaimed
217///
218/// # Cloning
219///
220/// Cloning is extremely cheap (O(1)) as it only increments the reference count:
221/// ```ignore
222/// let s1 = interner.intern("data".to_string());
223/// let s2 = s1.clone();  // Fast: just increments refcount
224/// ```
225///
226/// # Comparison with `Arc<T>`
227///
228/// | Aspect | `Arc<T>` | `Interned<T>` |
229/// |--------|----------|---------------|
230/// | Deduplication | No (each `Arc::new` creates new allocation) | Yes (equal values share allocation) |
231/// | Overhead | Reference count only | Hash computation + lookup |
232/// | Memory usage | Higher for duplicates | Lower for duplicates |
233/// | Creation speed | Faster | Slower (requires hashing) |
234/// | Cross-execution stability | No | Yes (with stable hashing) |
235///
236/// # Serialization
237///
238/// When using [`Encode`]/[`Decode`] traits, `Interned<T>` automatically
239/// deduplicates values:
240/// - **First occurrence**: Serialized in full
241/// - **Subsequent occurrences**: Serialized as hash reference only
242///
243/// This dramatically reduces serialized size when many duplicate values exist.
244///
245/// # Example
246///
247/// ```ignore
248/// // Create interned values
249/// let s1 = interner.intern("hello".to_string());
250/// let s2 = interner.intern("hello".to_string());
251///
252/// // They share the same allocation
253/// assert!(Arc::ptr_eq(&s1.0, &s2.0));
254///
255/// // Transparent access
256/// assert_eq!(&*s1, "hello");
257/// assert_eq!(s1.len(), 5);
258///
259/// // Get owned copy if needed
260/// let owned: String = s1.clone_inner();
261/// ```
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash, StableHash)]
#[stable_hash_crate(qbice_stable_hash)]
// Field 0 is the shared allocation; the interner itself holds only a `Weak`
// to it, so the value lives exactly as long as some `Interned` handle does.
pub struct Interned<T: ?Sized>(Arc<T>);
265
266impl<T: ?Sized> Clone for Interned<T> {
267    fn clone(&self) -> Self { Self(self.0.clone()) }
268}
269
270impl<T> Interned<T> {
271    /// Returns an owned clone of the interned value.
272    #[must_use]
273    pub fn clone_inner(&self) -> T
274    where
275        T: Clone,
276    {
277        self.0.as_ref().clone()
278    }
279}
280
281impl<T: ?Sized> Deref for Interned<T> {
282    type Target = T;
283
284    fn deref(&self) -> &Self::Target { &self.0 }
285}
286
287impl<T: ?Sized> AsRef<T> for Interned<T> {
288    fn as_ref(&self) -> &T { &self.0 }
289}
290
291impl<T: ?Sized> Borrow<T> for Interned<T> {
292    fn borrow(&self) -> &T { &self.0 }
293}
294
/// On-wire representation of an [`Interned`] value.
///
/// The first occurrence of a value in a session travels as [`Self::Source`]
/// (full payload); later occurrences travel as [`Self::Reference`] (content
/// hash only), matching the tag bytes written by `Encode for Interned<T>`.
enum WiredInterned<T> {
    /// The full value, serialized inline (tag byte `0`).
    Source(T),
    /// A 128-bit content hash referring to a previously decoded value
    /// (tag byte `1`).
    Reference(Compact128),
}
299
300impl<T: Encode> Encode for WiredInterned<T> {
301    fn encode<E: Encoder + ?Sized>(
302        &self,
303        encoder: &mut E,
304        plugin: &Plugin,
305        session: &mut Session,
306    ) -> std::io::Result<()> {
307        match self {
308            Self::Source(value) => {
309                encoder.emit_u8(0)?;
310                value.encode(encoder, plugin, session)
311            }
312            Self::Reference(hash) => {
313                encoder.emit_u8(1)?;
314                hash.encode(encoder, plugin, session)
315            }
316        }
317    }
318}
319
320impl<T: Decode> Decode for WiredInterned<T> {
321    fn decode<D: Decoder + ?Sized>(
322        decoder: &mut D,
323        plugin: &Plugin,
324        session: &mut Session,
325    ) -> std::io::Result<Self> {
326        let tag = decoder.read_u8()?;
327        match tag {
328            0 => {
329                let value = T::decode(decoder, plugin, session)?;
330                Ok(Self::Source(value))
331            }
332            1 => {
333                let hash = Compact128::decode(decoder, plugin, session)?;
334                Ok(Self::Reference(hash))
335            }
336
337            _ => Err(std::io::Error::new(
338                std::io::ErrorKind::InvalidData,
339                "invalid tag for WiredInterned",
340            )),
341        }
342    }
343}
344
345impl Decode for Interned<str> {
346    fn decode<D: Decoder + ?Sized>(
347        decoder: &mut D,
348        plugin: &Plugin,
349        session: &mut Session,
350    ) -> std::io::Result<Self> {
351        let wired =
352            WiredInterned::<Box<str>>::decode(decoder, plugin, session)?;
353
354        let interner = plugin.get::<Interner>().expect(
355            "`SharedInterner` plugin missing for decoding `Interned<str>`",
356        );
357
358        let value = match wired {
359            WiredInterned::Source(source) => interner.intern_unsized(source),
360
361            WiredInterned::Reference(compact128) => interner
362                .get_from_hash::<str>(compact128)
363                .expect("referenced interned value not found in interner"),
364        };
365
366        Ok(value)
367    }
368}
369
370impl<T: Decode + StableHash + Identifiable + Send + Sync + 'static> Decode
371    for Interned<[T]>
372{
373    fn decode<D: Decoder + ?Sized>(
374        decoder: &mut D,
375        plugin: &Plugin,
376        session: &mut Session,
377    ) -> std::io::Result<Self> {
378        let wired =
379            WiredInterned::<Box<[T]>>::decode(decoder, plugin, session)?;
380
381        let interner = plugin.get::<Interner>().expect(
382            "`SharedInterner` plugin missing for decoding `Interned<[T]>`",
383        );
384
385        let value = match wired {
386            WiredInterned::Source(source) => interner.intern_unsized(source),
387
388            WiredInterned::Reference(compact128) => interner
389                .get_from_hash::<[T]>(compact128)
390                .expect("referenced interned value not found in interner"),
391        };
392
393        Ok(value)
394    }
395}
396
397impl Decode for Interned<Path> {
398    fn decode<D: Decoder + ?Sized>(
399        decoder: &mut D,
400        plugin: &Plugin,
401        session: &mut Session,
402    ) -> std::io::Result<Self> {
403        let wired =
404            WiredInterned::<Box<Path>>::decode(decoder, plugin, session)?;
405
406        let interner = plugin.get::<Interner>().expect(
407            "`SharedInterner` plugin missing for decoding `Interned<Path>`",
408        );
409
410        let value = match wired {
411            WiredInterned::Source(source) => interner.intern_unsized(source),
412
413            WiredInterned::Reference(compact128) => interner
414                .get_from_hash::<Path>(compact128)
415                .expect("referenced interned value not found in interner"),
416        };
417
418        Ok(value)
419    }
420}
421
/// A session key for tracking seen interned IDs during encoding.
///
/// The associated set records which [`InternedID`]s have already been written
/// in the current [`Session`], so repeats can be encoded as hash references
/// instead of full values.
struct SeenInterned;

impl SessionKey for SeenInterned {
    type Value = FxHashSet<InternedID>;
}
428
impl<T: Identifiable + StableHash + Encode + Send + Sync + 'static + ?Sized>
    Encode for Interned<T>
{
    /// Encodes the value in full on first sight within the session, and as a
    /// 128-bit hash reference on every later occurrence.
    ///
    /// # Panics
    ///
    /// Panics if no [`Interner`] was registered in `plugin`.
    fn encode<E: Encoder + ?Sized>(
        &self,
        encoder: &mut E,
        plugin: &Plugin,
        session: &mut Session,
    ) -> std::io::Result<()> {
        let interner = plugin.get::<Interner>().expect(
            "`SharedInterner` plugin missing for encoding `Interned<T>`",
        );

        let value: &T = &self.0;
        let compact_128 = interner.hash_128(value);

        // `insert` returns `true` only if this (type, hash) pair has not
        // been encoded yet in this session.
        let seen_interned = session.get_mut_or_default::<SeenInterned>();
        let first = seen_interned.insert(InternedID {
            stable_type_id: T::STABLE_TYPE_ID,
            hash_128: compact_128,
        });

        // Tag bytes deliberately match `WiredInterned`'s wire format, which
        // is what the `Decode` impls read back.
        if first {
            // serialize the full value
            encoder.emit_u8(0)?;
            value.encode(encoder, plugin, session)
        } else {
            // serialize only the reference
            encoder.emit_u8(1)?;
            compact_128.encode(encoder, plugin, session)
        }
    }
}
462
463impl<T: Identifiable + StableHash + Decode + Send + Sync + 'static> Decode
464    for Interned<T>
465{
466    fn decode<D: Decoder + ?Sized>(
467        decoder: &mut D,
468        plugin: &Plugin,
469        session: &mut Session,
470    ) -> std::io::Result<Self> {
471        let wired = WiredInterned::<T>::decode(decoder, plugin, session)?;
472        let interner = plugin.get::<Interner>().expect(
473            "`SharedInterner` plugin missing for decoding `Interned<T>`",
474        );
475
476        let value = match wired {
477            WiredInterned::Source(source) => interner.intern(source),
478
479            WiredInterned::Reference(compact128) => interner
480                .get_from_hash::<T>(compact128)
481                .expect("referenced interned value not found in interner"),
482        };
483
484        Ok(value)
485    }
486}
487
/// Monomorphized hashing shim stored behind the `stable_hash_fn` pointer in
/// [`Repr`].
///
/// Downcasts the type-erased hasher builder back to its concrete type `H`,
/// builds a fresh hasher, and runs `value` through it.
///
/// # Panics
///
/// Panics if `build_hasher` is not actually an `H`; this cannot happen when
/// called through [`Repr`], which pairs the erased builder with the matching
/// function pointer at construction.
fn stable_hash<H: BuildStableHasher + 'static>(
    build_hasher: &dyn Any,
    value: &dyn DynStableHash,
) -> u128
where
    <H as BuildStableHasher>::Hasher: StableHasher<Hash = u128>,
{
    let build_hasher =
        build_hasher.downcast_ref::<H>().expect("invalid hasher type");

    let mut hasher = build_hasher.build_stable_hasher();
    value.stable_hash_dyn(&mut hasher);
    hasher.finish()
}
502
/// Object-safe adapter over [`StableHash`], so a value can be hashed through
/// a `&dyn DynStableHash` reference by the type-erased [`stable_hash`] shim.
trait DynStableHash {
    /// Feeds `self` into `hasher` exactly as `StableHash::stable_hash` would.
    fn stable_hash_dyn(&self, hasher: &mut dyn StableHasher<Hash = u128>);
}
506
// Blanket impl: every stably hashable type (sized or not) forwards to its
// own `StableHash` implementation.
impl<T: StableHash + ?Sized> DynStableHash for T {
    fn stable_hash_dyn(&self, hasher: &mut dyn StableHasher<Hash = u128>) {
        StableHash::stable_hash(self, hasher);
    }
}
512
/// A type-erased per-type shard table.
struct Shard {
    /// Concretely a `TypedShard<T>` for some `T`; erased so shards for
    /// different types can live in one `WholeShard` map.
    typed_shard: Box<dyn Any + Send + Sync>,
    /// Monomorphized cleanup routine (`vacuum_shard::<T>`) that can downcast
    /// `typed_shard` back to its concrete type and drop dead weak entries.
    vacuum_fn: fn(&dyn Any),
}
517
/// Per-type sharded map from content hash to the (weakly held) allocation.
type TypedShard<T> = Sharded<HashMap<Compact128, Weak<T>, FxBuildHasher>>;
/// Top-level sharded map from stable type ID to that type's erased shard.
type WholeShard = Sharded<HashMap<StableTypeID, Shard, FxBuildHasher>>;
520
/// Shared state behind an [`Interner`] handle.
struct Repr {
    /// All interned entries, sharded first by type, then by content hash.
    shards: WholeShard,

    /// The user-supplied hasher builder, type-erased for storage.
    hasher_builder_erased: Box<dyn Any + Send + Sync>,
    /// Monomorphized [`stable_hash`] that knows how to downcast
    /// `hasher_builder_erased` back to its concrete type.
    stable_hash_fn: fn(&dyn Any, &dyn DynStableHash) -> u128,

    /// Sender to signal the vacuum thread.
    vacuum_command: Option<Sender<VacuumCommand>>,
}
530
531/// A thread-safe interner for deduplicating values based on stable hashing.
532///
533/// The `Interner` uses sharding to enable high-concurrency access from multiple
534/// threads with minimal contention. Values are identified by their 128-bit
535/// stable hash combined with type ID, ensuring safe and deterministic
536/// deduplication.
537///
538/// # How It Works
539///
540/// 1. **Hash**: When interning a value, compute its stable 128-bit hash
541/// 2. **Lookup**: Check if a value with that hash already exists in the
542///    appropriate shard
543/// 3. **Reuse or Store**:
544///    - If found and alive: return handle to existing value
545///    - Otherwise: store new value and return handle to it
546///
547/// # Sharding Architecture
548///
549/// ```text
550/// Interner
551///   ├─ Shard 0: HashMap<InternedID, Weak<T>>
552///   ├─ Shard 1: HashMap<InternedID, Weak<T>>
553///   ├─ ...
554///   └─ Shard N: HashMap<InternedID, Weak<T>>
555/// ```
556///
557/// Each shard:
558/// - Has its own lock (reducing contention)
559/// - Handles a subset of hash values
560/// - Can be accessed in parallel with other shards
561///
562/// # Weak References and Garbage Collection
563///
564/// The interner stores [`Weak`] references, not strong references:
565/// - Values are kept alive only by [`Interned<T>`] handles
566/// - When all handles are dropped, the value is deallocated
567/// - Subsequent intern calls for the same value create a new allocation
568/// - Dead weak references can be cleaned up via [`vacuum`](Self::vacuum) or
569///   automatically by a background vacuum thread
570///
571/// This prevents memory leaks from long-lived interners accumulating values.
572///
573/// # Background Vacuum Thread
574///
575/// The interner can optionally run a background thread that periodically
576/// removes dead weak references from the shards. Use
577/// [`new_with_vacuum`](Self::new_with_vacuum) to create an interner with
578/// automatic cleanup:
579///
580/// ```ignore
581/// use std::time::Duration;
582///
583/// // Vacuum every 60 seconds
584/// let interner = Interner::new_with_vacuum(
585///     16,
586///     BuildStableHasher128::default(),
587///     Duration::from_secs(60),
588/// );
589///
590/// // Or trigger vacuum manually
591/// interner.vacuum();
592/// ```
593///
594/// # Thread Safety
595///
596/// Fully thread-safe and can be shared using `.clone()` directly. Multiple
597/// threads can safely call [`intern`](Self::intern) concurrently. Sharding
598/// ensures that operations on different hash buckets don't block each other.
599///
600/// # Performance Characteristics
601///
602/// - **Intern**: O(1) expected, requires hashing entire value
603/// - **Lookup**: O(1) expected, requires hashing entire value
604/// - **Memory**: O(unique values + shard overhead)
605/// - **Concurrency**: Excellent when shards ≥ typical concurrent threads
606///
607/// # Example
608///
609/// ```ignore
610/// use qbice_stable_hash::BuildStableHasher128;
611///
612/// let interner = Interner::new(16, BuildStableHasher128::default());
613///
614/// // Intern values
615/// let v1 = interner.intern("data".to_string());
616/// let v2 = interner.intern("data".to_string());
617///
618/// // Same allocation
619/// assert!(Arc::ptr_eq(&v1.0, &v2.0));
620///
621/// // When all handles drop, value is freed
622/// drop(v1);
623/// drop(v2);
624/// // "data" is now deallocated
625/// ```
#[derive(Debug, Clone)]
pub struct Interner {
    /// Shared interner state; cloning an `Interner` just clones this `Arc`.
    inner: Arc<Repr>,
    /// Handle to the vacuum thread (if any). Kept for lifetime management.
    _vacuum_thread: Option<Arc<VacuumThreadHandle>>,
}
632
/// Command sent to the vacuum thread.
///
/// Sent by [`Interner::request_vacuum`] (`Trigger`) and by
/// `VacuumThreadHandle`'s `Drop` implementation (`Stop`).
enum VacuumCommand {
    /// Trigger an immediate vacuum cycle.
    Trigger,
    /// Stop the vacuum thread.
    Stop,
}
640
/// Handle to manage the vacuum thread's lifecycle.
///
/// Dropping the handle signals the thread to stop and joins it.
struct VacuumThreadHandle {
    /// Channel for sending control commands to the vacuum thread.
    command_sender: Sender<VacuumCommand>,
    /// Join handle; `Option` so it can be `take`n when joining on drop.
    join_handle: Option<JoinHandle<()>>,
}
646
// Manual `Debug`: only the struct name is shown, with a `..` placeholder for
// the channel and thread-handle fields.
impl std::fmt::Debug for VacuumThreadHandle {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("VacuumThreadHandle").finish_non_exhaustive()
    }
}
652
653impl Drop for VacuumThreadHandle {
654    fn drop(&mut self) {
655        // Signal the thread to stop
656        let _ = self.command_sender.send(VacuumCommand::Stop);
657
658        // Wait for the thread to finish
659        if let Some(handle) = self.join_handle.take() {
660            let _ = handle.join();
661        }
662    }
663}
664
665impl Interner {
666    /// Creates a new shared interner with the specified sharding and hasher.
667    ///
668    /// This creates an interner without a background vacuum thread. Dead weak
669    /// references will accumulate until you call [`vacuum`](Self::vacuum)
670    /// manually or until they are replaced by new intern calls.
671    ///
672    /// For automatic cleanup, use [`new_with_vacuum`](Self::new_with_vacuum).
673    ///
674    /// # Parameters
675    ///
676    /// - `shard_amount`: The number of shards for concurrent access. More
677    ///   shards reduce contention but increase memory overhead. Recommended:
678    ///   16-32 for multi-threaded use, or the number of CPU cores.
679    /// - `hasher_builder`: The builder for creating stable hashers used to hash
680    ///   interned values. Must produce deterministic 128-bit hashes.
681    ///
682    /// # Example
683    ///
684    /// ```ignore
685    /// use qbice_stable_hash::BuildStableHasher128;
686    ///
687    /// let interner = Interner::new(16, BuildStableHasher128::default());
688    /// ```
689    pub fn new<S: BuildStableHasher<Hash = u128> + Send + Sync + 'static>(
690        shard_amount: usize,
691        hasher_builder: S,
692    ) -> Self {
693        Self {
694            inner: Arc::new(Repr::new(shard_amount, hasher_builder)),
695            _vacuum_thread: None,
696        }
697    }
698
699    /// Creates a new shared interner with a background vacuum thread.
700    ///
701    /// The vacuum thread periodically removes dead weak references from the
702    /// interner's shards, freeing up memory from entries that are no longer
703    /// referenced.
704    ///
705    /// # Parameters
706    ///
707    /// - `shard_amount`: The number of shards for concurrent access. More
708    ///   shards reduce contention but increase memory overhead. Recommended:
709    ///   16-32 for multi-threaded use, or the number of CPU cores.
710    /// - `hasher_builder`: The builder for creating stable hashers used to hash
711    ///   interned values. Must produce deterministic 128-bit hashes.
712    /// - `vacuum_interval`: The duration between automatic vacuum runs. The
713    ///   vacuum thread will sleep for this duration between cleanup cycles.
714    ///
715    /// # Thread Behavior
716    ///
717    /// - The vacuum thread is automatically stopped when the interner is
718    ///   dropped
719    /// - You can still call [`vacuum`](Self::vacuum) manually for immediate
720    ///   cleanup
721    /// - The vacuum process acquires write locks on shards, so very frequent
722    ///   vacuuming may impact performance
723    ///
724    /// # Recommended Intervals
725    ///
726    /// - Low memory pressure: 5-10 minutes
727    /// - Moderate usage: 30-60 seconds
728    /// - High churn (many short-lived values): 5-15 seconds
729    ///
730    /// # Example
731    ///
732    /// ```ignore
733    /// use std::time::Duration;
734    /// use qbice_stable_hash::BuildStableHasher128;
735    ///
736    /// // Create interner with vacuum every 30 seconds
737    /// let interner = Interner::new_with_vacuum(
738    ///     16,
739    ///     BuildStableHasher128::default(),
740    ///     Duration::from_secs(30),
741    /// );
742    /// ```
743    pub fn new_with_vacuum<
744        S: BuildStableHasher<Hash = u128> + Send + Sync + 'static,
745    >(
746        shard_amount: usize,
747        hasher_builder: S,
748        vacuum_interval: Duration,
749    ) -> Self {
750        let (command_tx, command_rx) = unbounded::<VacuumCommand>();
751
752        let repr = Arc::new(Repr::new_with_vacuum(
753            shard_amount,
754            hasher_builder,
755            command_tx.clone(),
756        ));
757
758        let repr_weak = Arc::downgrade(&repr);
759
760        let join_handle = std::thread::Builder::new()
761            .name("interner-vacuum".to_string())
762            .spawn(move || {
763                vacuum_thread_loop(&repr_weak, &command_rx, vacuum_interval);
764            })
765            .expect("failed to spawn vacuum thread");
766
767        Self {
768            inner: repr,
769            _vacuum_thread: Some(Arc::new(VacuumThreadHandle {
770                command_sender: command_tx,
771                join_handle: Some(join_handle),
772            })),
773        }
774    }
775
776    /// Manually triggers a vacuum operation to clean up dead weak references.
777    ///
778    /// This method removes all dead [`Weak`] references from the interner's
779    /// internal hash maps. Dead references occur when all [`Interned<T>`]
780    /// handles for a value have been dropped.
781    ///
782    /// # When to Use
783    ///
784    /// - After a large batch of interned values has been dropped
785    /// - Before serializing the interner state (to reduce size)
786    /// - In memory-constrained environments
787    /// - When not using a background vacuum thread
788    ///
789    /// # Performance
790    ///
791    /// - Acquires write locks on all shards sequentially
792    /// - Time complexity: O(total entries across all shards)
793    /// - May briefly block other intern/lookup operations
794    ///
795    /// # Example
796    ///
797    /// ```ignore
798    /// // Intern many temporary values
799    /// for i in 0..10000 {
800    ///     let _ = interner.intern(format!("temp_{}", i));
801    /// }
802    /// // All Interned handles dropped, but weak refs remain
803    ///
804    /// // Clean up dead references
805    /// interner.vacuum();
806    /// ```
807    pub fn vacuum(&self) { vacuum_shards(&self.inner.shards); }
808
809    /// Requests the background vacuum thread to run immediately.
810    ///
811    /// If the interner was created with
812    /// [`new_with_vacuum`](Self::new_with_vacuum), this signals the
813    /// background thread to perform a vacuum cycle as soon as
814    /// possible, rather than waiting for the next scheduled interval.
815    ///
816    /// If no background vacuum thread exists (interner created with
817    /// [`new`](Self::new)), this method does nothing.
818    ///
819    /// # Non-blocking
820    ///
821    /// This method returns immediately after signaling the thread. It does not
822    /// wait for the vacuum operation to complete. If you need synchronous
823    /// cleanup, use [`vacuum`](Self::vacuum) instead.
824    ///
825    /// # Example
826    ///
827    /// ```ignore
828    /// // Request early vacuum (non-blocking)
829    /// interner.request_vacuum();
830    ///
831    /// // Or use vacuum() for synchronous cleanup
832    /// interner.vacuum();
833    /// ```
834    pub fn request_vacuum(&self) {
835        if let Some(sender) = &self.inner.vacuum_command {
836            // Send trigger command; unbounded channel won't block
837            let _ = sender.send(VacuumCommand::Trigger);
838        }
839    }
840
841    /// Computes the 128-bit stable hash of a value.
842    ///
843    /// This method uses the interner's configured hasher to compute a
844    /// deterministic hash. The same value will always produce the same hash
845    /// within this interner instance (and across instances using the same
846    /// hasher configuration).
847    ///
848    /// # Parameters
849    ///
850    /// * `value` - A reference to the value to hash. Must implement
851    ///   [`StableHash`].
852    ///
853    /// # Returns
854    ///
855    /// A [`Compact128`] representing the 128-bit stable hash of the value.
856    ///
857    /// # Use Cases
858    ///
859    /// This method is primarily used internally but can be useful for:
860    /// - Computing hashes for caching or indexing
861    /// - Debugging hash collisions
862    /// - Implementing custom interning strategies
863    ///
864    /// # Example
865    ///
866    /// ```ignore
867    /// let hash1 = interner.hash_128(&"hello".to_string());
868    /// let hash2 = interner.hash_128(&"hello".to_string());
869    /// assert_eq!(hash1, hash2);  // Same value, same hash
870    ///
871    /// let hash3 = interner.hash_128(&"world".to_string());
872    /// assert_ne!(hash1, hash3);  // Different value, different hash
873    /// ```
    pub fn hash_128<T: StableHash + ?Sized>(&self, value: &T) -> Compact128 {
        // NOTE: `&value` (a `&&T`) is passed rather than `value` itself.
        // `T: ?Sized` means `&T` cannot be coerced to a `&dyn
        // DynStableHash` trait object, while the reference `&T` is `Sized`
        // and satisfies the blanket `DynStableHash` impl (relying on `&T:
        // StableHash` forwarding to `T`, as reference impls of hash traits
        // conventionally do).
        let hash_u128 = (self.inner.stable_hash_fn)(
            &*self.inner.hasher_builder_erased,
            &value,
        );

        hash_u128.into()
    }
882
    /// Returns a read guard mapped onto the type-erased [`Shard`] for `T`,
    /// creating the shard on first use.
    ///
    /// Fast path: map the read lock directly onto an existing entry. Slow
    /// path: drop the read lock, take the write lock, insert an empty
    /// `TypedShard<T>`, then loop so the guard is always produced through
    /// the fast path. The `Entry::Vacant` check guards against another
    /// thread inserting the shard in the window between dropping the read
    /// lock and acquiring the write lock.
    fn obtain_read_shard<T: Identifiable + Send + Sync + 'static + ?Sized>(
        &self,
    ) -> MappedRwLockReadGuard<'_, Shard> {
        let shard_amount = self.inner.shards.shard_amount();
        let stable_type_id = T::STABLE_TYPE_ID;

        loop {
            let shard_index =
                self.inner.shards.shard_index(stable_type_id.low());
            let shard = self.inner.shards.read_shard(shard_index);

            // fast path, obtain read lock if shard exists
            if let Ok(exist) =
                RwLockReadGuard::try_map(shard, |x| x.get(&stable_type_id))
            {
                return exist;
            }

            // slow path, need to create the shard, the `shard` has already
            // dropped here
            let mut write_shard = self.inner.shards.write_shard(shard_index);

            if let Entry::Vacant(entry) = write_shard.entry(stable_type_id) {
                entry.insert(Shard {
                    typed_shard: {
                        Box::new(TypedShard::<T>::new(shard_amount, |_| {
                            HashMap::default()
                        }))
                    },
                    vacuum_fn: vacuum_shard::<T>,
                });
            }
        }
    }
917
918    /// Retrieves an interned value by its 128-bit hash, if it exists and is
919    /// alive.
920    ///
921    /// This method looks up a value that was previously interned and still has
922    /// at least one active [`Interned<T>`] handle. It's primarily used during
923    /// deserialization to resolve hash references back to actual values.
924    ///
925    /// # Parameters
926    ///
927    /// * `hash_128` - The 128-bit stable hash of the value to retrieve.
928    ///
929    /// # Type Parameter
930    ///
931    /// * `T` - The expected type of the value. Must match the type that was
932    ///   originally interned.
933    ///
934    /// # Returns
935    ///
936    /// * `Some(Interned<T>)` - If a value with this hash exists, is still alive
937    ///   (has active handles), and has the correct type
938    /// * `None` - If:
939    ///   - No value with this hash exists
940    ///   - The value has been deallocated (no active handles)
941    ///   - Type mismatch (requested type doesn't match interned type)
942    ///
943    /// # Type Safety
944    ///
945    /// The method uses [`StableTypeID`] to ensure type safety. Attempting to
946    /// retrieve a value with the wrong type returns `None`, preventing type
947    /// confusion even if hash values match.
948    ///
949    /// # Serialization Context
950    ///
951    /// During serialization, [`Interned<T>`] values are encoded as:
952    /// 1. **First occurrence**: Full value + hash
953    /// 2. **Subsequent occurrences**: Hash reference only
954    ///
955    /// During deserialization, hash references are resolved using this method
956    /// to reconstruct the original value structure.
957    ///
958    /// # Example
959    ///
960    /// ```ignore
961    /// // Intern a value
962    /// let original = interner.intern("data".to_string());
963    /// let hash = interner.hash_128(&*original);
964    ///
965    /// // Retrieve by hash
966    /// let retrieved = interner.get_from_hash::<String>(hash)
967    ///     .expect("value should still be alive");
968    ///
969    /// assert!(Arc::ptr_eq(&original.0, &retrieved.0));
970    ///
971    /// // After all handles drop, lookup returns None
972    /// drop(original);
973    /// drop(retrieved);
974    /// assert!(interner.get_from_hash::<String>(hash).is_none());
975    /// ```
976    #[must_use]
977    pub fn get_from_hash<T: Identifiable + Send + Sync + 'static + ?Sized>(
978        &self,
979        hash_128: Compact128,
980    ) -> Option<Interned<T>> {
981        let read_shard = self.obtain_read_shard::<T>();
982        let typed_shard = read_shard
983            .typed_shard
984            .downcast_ref::<TypedShard<T>>()
985            .expect("should be correct type");
986
987        let shard_index = typed_shard.shard_index(hash_128.low());
988        let read_lock = typed_shard.read_shard(shard_index);
989
990        read_lock
991            .get(&hash_128)
992            .and_then(std::sync::Weak::upgrade)
993            .map(|arc| Interned(arc))
994    }
995
996    /// Interns a value, returning a reference-counted handle to the shared
997    /// allocation.
998    ///
999    /// If an equal value (as determined by stable hash) has already been
1000    /// interned and is still alive, this method returns a handle to the
1001    /// existing allocation. Otherwise, it stores the new value and returns a
1002    /// handle to it.
1003    ///
1004    /// # Equality Semantics
1005    ///
1006    /// Values are considered equal if:
1007    /// 1. They have the same type (same [`StableTypeID`])
1008    /// 2. They have the same stable hash (128-bit)
1009    ///
1010    /// **Important**: This uses **hash equality**, not structural equality
1011    /// (`PartialEq`). While 128-bit hash collisions are astronomically
1012    /// unlikely, they are theoretically possible. In practice, this is not a
1013    /// concern for most applications.
1014    ///
1015    /// # Type Parameters
1016    ///
1017    /// * `T` - The type of value to intern. Must implement:
1018    ///   - [`StableHash`]: For computing the deterministic content hash
1019    ///   - [`Identifiable`]: For the stable type ID
1020    ///   - `Send + Sync + 'static`: For safe sharing across threads
1021    ///
1022    /// # Parameters
1023    ///
1024    /// * `value` - The value to intern. Takes ownership.
1025    ///
1026    /// # Returns
1027    ///
1028    /// An [`Interned<T>`] handle to the interned value. If an equal value was
1029    /// already interned, the returned handle points to the existing allocation.
1030    ///
1031    /// # Thread Safety
1032    ///
1033    /// This method is fully thread-safe and can be called concurrently:
1034    /// - If two threads intern the same value simultaneously, only one
1035    ///   allocation is created
1036    /// - Both threads receive handles to the same allocation
1037    /// - The losing thread's value is dropped
1038    ///
1039    /// # Performance
1040    ///
1041    /// * **Time complexity**: O(1) expected, O(n) worst case (hash table)
1042    /// * **Hashing cost**: O(size of value) - entire value is hashed
1043    /// * **Lock contention**: Only affects the specific shard for this hash
1044    /// * **Memory**: Reuses existing allocation if value already interned
1045    ///
1046    /// # Trade-offs
1047    ///
1048    /// **Slower than `Arc::new`** because:
1049    /// - Must hash the entire value
1050    /// - Requires hash table lookup
1051    /// - May need to acquire locks
1052    ///
1053    /// **Saves memory when**:
1054    /// - Many duplicate values exist
1055    /// - Values are large
1056    /// - Long-lived values with many references
1057    ///
1058    /// # Example
1059    ///
1060    /// ```ignore
1061    /// // First intern - creates new allocation
1062    /// let s1 = interner.intern("hello".to_string());
1063    ///
1064    /// // Second intern - reuses existing allocation
1065    /// let s2 = interner.intern("hello".to_string());
1066    ///
1067    /// // Same pointer, same allocation
1068    /// assert!(Arc::ptr_eq(&s1.0, &s2.0));
1069    ///
1070    /// // Different value - new allocation
1071    /// let s3 = interner.intern("world".to_string());
1072    /// assert!(!Arc::ptr_eq(&s1.0, &s3.0));
1073    /// ```
1074    pub fn intern<T: StableHash + Identifiable + Send + Sync + 'static>(
1075        &self,
1076        value: T,
1077    ) -> Interned<T> {
1078        let hash_128 = self.hash_128(&value);
1079        let read_shard = self.obtain_read_shard::<T>();
1080        let typed_shard = read_shard
1081            .typed_shard
1082            .downcast_ref::<TypedShard<T>>()
1083            .expect("should be correct type");
1084
1085        let shard_index = typed_shard.shard_index(hash_128.low());
1086
1087        // First, try to find an existing interned value
1088        {
1089            let read_shard = typed_shard.read_shard(shard_index);
1090
1091            if let Some(arc) =
1092                read_shard.get(&hash_128).and_then(std::sync::Weak::upgrade)
1093            {
1094                return Interned(arc);
1095            }
1096        }
1097
1098        // Not found, so insert a new one
1099        {
1100            let mut write_shard = typed_shard.write_shard(shard_index);
1101
1102            match write_shard.entry(hash_128) {
1103                // double check in case another thread inserted it
1104                std::collections::hash_map::Entry::Occupied(mut entry) => {
1105                    if let Some(arc) = entry.get().upgrade() {
1106                        return Interned(arc);
1107                    }
1108
1109                    // The weak reference is dead, we can replace it
1110                    let arc = Arc::new(value);
1111                    let weak = Arc::downgrade(&arc);
1112
1113                    entry.insert(weak);
1114
1115                    Interned(arc)
1116                }
1117
1118                std::collections::hash_map::Entry::Vacant(entry) => {
1119                    let arc = Arc::new(value);
1120                    let weak = Arc::downgrade(&arc);
1121
1122                    entry.insert(weak);
1123
1124                    Interned(arc)
1125                }
1126            }
1127        }
1128    }
1129
1130    /// Interns an unsized value, returning a reference-counted handle to the
1131    /// shared allocation.
1132    ///
1133    /// This method is similar to [`intern`](Self::intern) but supports unsized
1134    /// types like `str` and `[T]`. The value is provided as a sized type `Q`
1135    /// that can be borrowed as `T` and converted into `Arc<T>`.
1136    ///
1137    /// # Type Parameters
1138    ///
1139    /// * `T` - The unsized type to intern (e.g., `str`, `[u8]`). Must
1140    ///   implement:
1141    ///   - [`StableHash`]: For computing the deterministic content hash
1142    ///   - [`Identifiable`]: For the stable type ID
1143    ///   - `Send + Sync + 'static`: For safe sharing across threads
1144    ///
1145    /// * `Q` - The sized type that owns the value. Must implement:
1146    ///   - [`Borrow<T>`]: To access the unsized value for hashing
1147    ///   - `Send + Sync + 'static`: For safe sharing across threads
1148    ///   - `Arc<T>: From<Q>`: For efficient conversion to `Arc<T>`
1149    ///
1150    /// # Common Type Combinations
1151    ///
1152    /// | Unsized `T` | Sized `Q` options |
1153    /// |-------------|-------------------|
1154    /// | `str` | `String`, `Box<str>`, `Cow<'static, str>` |
1155    /// | `[u8]` | `Vec<u8>`, `Box<[u8]>` |
1156    /// | `[T]` | `Vec<T>`, `Box<[T]>` |
1157    ///
1158    /// # Example
1159    ///
1160    /// ```ignore
1161    /// // Intern strings as str slices
1162    /// let s1: Interned<str> = interner.intern_unsized("hello".to_string());
1163    /// let s2: Interned<str> = interner.intern_unsized("hello".to_string());
1164    ///
1165    /// // Same allocation
1166    /// assert!(Arc::ptr_eq(&s1.0, &s2.0));
1167    ///
1168    /// // Transparent access via Deref
1169    /// assert_eq!(&*s1, "hello");
1170    /// assert_eq!(s1.len(), 5);
1171    ///
1172    /// // Works with byte slices too
1173    /// let bytes: Interned<[u8]> = interner.intern_unsized(vec![1, 2, 3]);
1174    /// assert_eq!(&*bytes, &[1, 2, 3]);
1175    /// ```
1176    ///
1177    /// # Performance
1178    ///
1179    /// Same as [`intern`](Self::intern) - O(1) expected time with hashing
1180    /// overhead proportional to value size.
1181    pub fn intern_unsized<
1182        T: StableHash + Identifiable + Send + Sync + 'static + ?Sized,
1183        Q: Borrow<T> + Send + Sync + 'static,
1184    >(
1185        &self,
1186        value: Q,
1187    ) -> Interned<T>
1188    where
1189        Arc<T>: From<Q>,
1190    {
1191        let hash_128 = self.hash_128(value.borrow());
1192        let read_shard = self.obtain_read_shard::<T>();
1193        let typed_shard = read_shard
1194            .typed_shard
1195            .downcast_ref::<TypedShard<T>>()
1196            .expect("should be correct type");
1197
1198        let shard_index = typed_shard.shard_index(hash_128.low());
1199
1200        // First, try to find an existing interned value
1201        {
1202            let read_shard = typed_shard.read_shard(shard_index);
1203
1204            if let Some(arc) =
1205                read_shard.get(&hash_128).and_then(std::sync::Weak::upgrade)
1206            {
1207                return Interned(arc);
1208            }
1209        }
1210
1211        // Not found, so insert a new one
1212        {
1213            let mut write_shard = typed_shard.write_shard(shard_index);
1214
1215            match write_shard.entry(hash_128) {
1216                // double check in case another thread inserted it
1217                std::collections::hash_map::Entry::Occupied(mut entry) => {
1218                    if let Some(arc) = entry.get().upgrade() {
1219                        return Interned(arc);
1220                    }
1221
1222                    // The weak reference is dead, we can replace it
1223                    let arc = Arc::from(value);
1224                    let weak = Arc::downgrade(&arc);
1225
1226                    entry.insert(weak);
1227
1228                    Interned(arc)
1229                }
1230
1231                std::collections::hash_map::Entry::Vacant(entry) => {
1232                    let arc = Arc::from(value);
1233                    let weak = Arc::downgrade(&arc);
1234
1235                    entry.insert(weak);
1236
1237                    Interned(arc)
1238                }
1239            }
1240        }
1241    }
1242}
1243
impl std::fmt::Debug for Repr {
    /// Formats as `Interner { .. }` without dumping shard contents.
    ///
    /// NOTE(review): the struct name printed is "Interner", not "Repr" —
    /// presumably because `Repr` is the private representation behind the
    /// public `Interner` handle; confirm this is intentional.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("Interner").finish_non_exhaustive()
    }
}
1249
1250fn vacuum_shard<T: ?Sized + 'static>(shard: &dyn Any) {
1251    let typed_shard = shard
1252        .downcast_ref::<TypedShard<T>>()
1253        .expect("invalid shard type for vacuum");
1254
1255    for mut write_shard in typed_shard.iter_write_shards() {
1256        write_shard.retain(|_, weak_value| weak_value.upgrade().is_some());
1257    }
1258}
1259
1260/// Vacuum all shards in the interner, removing dead weak references.
1261fn vacuum_shards(shards: &WholeShard) {
1262    for read_shard in shards.iter_read_shards() {
1263        for shard in read_shard.values() {
1264            (shard.vacuum_fn)(&*shard.typed_shard);
1265        }
1266    }
1267}
1268
1269/// The main loop for the background vacuum thread.
1270fn vacuum_thread_loop(
1271    repr_weak: &Weak<Repr>,
1272    command_rx: &Receiver<VacuumCommand>,
1273    vacuum_interval: Duration,
1274) {
1275    loop {
1276        // Wait for a command with timeout as the vacuum interval
1277        match command_rx.recv_timeout(vacuum_interval) {
1278            Ok(VacuumCommand::Stop)
1279            | Err(crossbeam_channel::RecvTimeoutError::Disconnected) => {
1280                // Stop signal received or channel disconnected, exit the loop
1281                return;
1282            }
1283            Ok(VacuumCommand::Trigger)
1284            | Err(crossbeam_channel::RecvTimeoutError::Timeout) => {
1285                // Trigger signal received, run vacuum immediately
1286            }
1287        }
1288
1289        // Try to upgrade the weak reference to run vacuum
1290        if let Some(repr) = repr_weak.upgrade() {
1291            vacuum_shards(&repr.shards);
1292        } else {
1293            // The interner has been dropped, exit the loop
1294            return;
1295        }
1296    }
1297}
1298
1299impl Repr {
1300    /// Creates a new interner with specified sharding and stable hasher.
1301    ///
1302    /// # Parameters
1303    ///
1304    /// * `shard_amount` - The number of shards for concurrent access. Each
1305    ///   shard has its own lock, so more shards reduce contention at the cost
1306    ///   of memory overhead.
1307    ///
1308    ///   **Recommended values:**
1309    ///   - Single-threaded: 1-4 shards
1310    ///   - Multi-threaded (typical): 16-32 shards
1311    ///   - High contention: 64-128 shards
1312    ///   - Powers of 2 work best (efficient bit masking)
1313    ///
1314    /// * `hasher_builder` - Builder for creating stable hashers. Must produce
1315    ///   deterministic 128-bit hashes. The same input must always produce the
1316    ///   same hash across all program executions for serialization to work
1317    ///   correctly.
1318    ///
1319    /// # Panics
1320    ///
1321    /// Panics if `shard_amount` is not a power of two.
1322    ///
1323    /// # Example
1324    ///
1325    /// ```ignore
1326    /// use qbice_stable_hash::BuildStableHasher128;
1327    ///
1328    /// // Create interner with 16 shards
1329    /// let interner = Interner::new(16, BuildStableHasher128::default());
1330    /// ```
1331    pub fn new<S: BuildStableHasher + Send + Sync + 'static>(
1332        shard_amount: usize,
1333        hasher_builder: S,
1334    ) -> Self
1335    where
1336        <S as BuildStableHasher>::Hasher: StableHasher<Hash = u128>,
1337    {
1338        Self {
1339            shards: Sharded::new(shard_amount, |_| HashMap::default()),
1340            hasher_builder_erased: Box::new(hasher_builder),
1341            stable_hash_fn: stable_hash::<S>,
1342            vacuum_command: None,
1343        }
1344    }
1345
1346    /// Creates a new interner with vacuum thread support.
1347    pub fn new_with_vacuum<S: BuildStableHasher + Send + Sync + 'static>(
1348        shard_amount: usize,
1349        hasher_builder: S,
1350        vacuum_command: Sender<VacuumCommand>,
1351    ) -> Self
1352    where
1353        <S as BuildStableHasher>::Hasher: StableHasher<Hash = u128>,
1354    {
1355        Self {
1356            shards: Sharded::new(shard_amount, |_| HashMap::default()),
1357            hasher_builder_erased: Box::new(hasher_builder),
1358            stable_hash_fn: stable_hash::<S>,
1359            vacuum_command: Some(vacuum_command),
1360        }
1361    }
1362}
1363
1364#[cfg(test)]
1365mod test;