qbice_storage/intern.rs
1//! A thread-safe interning system for deduplicating values based on stable
2//! hashing.
3//!
4//! Interning is a memory optimization technique where equal values are stored
5//! only once in memory, with all references pointing to the same allocation.
6//! This module provides a concurrent interner using stable hashing to identify
7//! equal values deterministically across program executions.
8//!
9//! # Key Components
10//!
11//! - [`InternedID`]: A unique identifier combining type ID and content hash
12//! - [`Interned<T>`]: A reference-counted handle providing transparent value
13//! access
14//! - [`Interner`]: The core deduplication engine with sharded concurrent access
15//! - [`SharedInterner`]: An `Arc`-wrapped interner for easy sharing
16//!
17//! # Benefits of Interning
18//!
19//! ## Memory Savings
20//!
21//! When you have many duplicate values, interning can dramatically reduce
22//! memory usage:
23//! - 1000 copies of "hello" → 1 allocation + 1000 small handles
24//! - Large duplicate structures → single allocation shared by all references
25//!
26//! ## Unsized Type Support
27//!
28//! The interner supports interning unsized types like `str` and `[T]` through
29//! the [`Interner::intern_unsized`] method. This is especially useful for
30//! string interning where you want to store `Interned<str>` rather than
31//! `Interned<String>`, saving the extra indirection of the `String` wrapper:
32//!
33//! ```ignore
34//! // Intern a str slice directly
35//! let s1: Interned<str> = interner.intern_unsized("hello world".to_string());
36//! let s2: Interned<str> = interner.intern_unsized("hello world".to_string());
37//!
38//! // Same allocation
39//! assert!(Arc::ptr_eq(&s1.0, &s2.0));
40//!
41//! // Can also use Box<str> or any type that converts to Arc<str>
42//! let s3: Interned<str> = interner.intern_unsized(Box::<str>::from("hello world"));
43//! ```
44//!
45//! ## Serialization Optimization
46//!
47//! The interning system integrates with the serialization framework:
48//! - First occurrence: full value serialized
49//! - Subsequent occurrences: only hash reference serialized
50//! - Automatic deduplication during deserialization
51//!
52//! ## Cross-Execution Stability
53//!
54//! Using stable hashing ensures:
55//! - Same values produce same hashes across program runs
56//! - Serialized data can reference values by hash
57//! - Deterministic behavior for testing and debugging
58//!
59//! # When to Use Interning
60//!
61//! **Use interning when:**
62//! - Many duplicate immutable values exist (strings, configs, AST nodes)
63//! - Values are compared frequently by identity
64//! - Serialization size matters (network protocols, caching)
65//! - Memory usage is more important than allocation speed
66//!
67//! **Use regular `Arc`/`Rc` when:**
68//! - Values are rarely duplicated
69//! - Cross-execution stability isn't needed
70//! - Allocation performance is critical (hashing overhead matters)
71//! - Values are mutable
72//!
73//! # Architecture
74//!
75//! The interner uses:
76//! - **Sharding**: Multiple independent hash tables to reduce lock contention
77//! - **Weak references**: Automatic cleanup when values are no longer used
78//! - **Stable hashing**: Deterministic 128-bit hashes for cross-run stability
79//! - **Type safety**: Type IDs prevent hash collisions between different types
80//!
81//! # Example
82//!
83//! ```ignore
84//! use qbice_storage::intern::Interner;
85//! use qbice_stable_hash::BuildStableHasher128;
86//!
87//! // Create an interner
88//! let interner = Interner::new(16, BuildStableHasher128::default());
89//!
90//! // Intern some values
91//! let s1 = interner.intern("hello world".to_string());
92//! let s2 = interner.intern("hello world".to_string());
93//!
94//! // Same allocation: s1 and s2 point to the exact same memory
95//! assert!(Arc::ptr_eq(&s1.0, &s2.0));
96//!
97//! // Transparent access via Deref
98//! assert_eq!(&*s1, "hello world");
99//!
100//! // When all Interned handles are dropped, the value is freed
101//! drop(s1);
102//! drop(s2);
103//! // Memory is now reclaimed
104//! ```
105//!
106//! # Serialization Integration
107//!
108//! ```ignore
109//! use qbice_serialize::Plugin;
110//!
111//! // Add interner to serialization plugin
112//! let mut plugin = Plugin::new();
113//! plugin.insert(interner.clone());
114//!
115//! // Encode data with automatic deduplication
116//! let data = vec![s1.clone(), s1.clone(), s2.clone()];
117//! // s1's value is serialized once, then referenced by hash
118//! encoder.encode(&data, &plugin, &mut session)?;
119//! ```
120
121use std::{
122 any::Any,
123 borrow::Borrow,
124 collections::{HashMap, hash_map::Entry},
125 hash::Hash,
126 ops::Deref,
127 path::Path,
128 sync::{Arc, Weak},
129 thread::JoinHandle,
130 time::Duration,
131};
132
133use crossbeam_channel::{Receiver, Sender, unbounded};
134use fxhash::{FxBuildHasher, FxHashSet};
135use parking_lot::{MappedRwLockReadGuard, RwLockReadGuard};
136use qbice_serialize::{
137 Decode, Decoder, Encode, Encoder, Plugin,
138 session::{Session, SessionKey},
139};
140use qbice_stable_hash::{
141 BuildStableHasher, Compact128, StableHash, StableHasher,
142};
143use qbice_stable_type_id::{Identifiable, StableTypeID};
144
145use crate::sharded::Sharded;
146
147/// A globally unique identifier for an interned value.
148///
149/// `InternedID` combines two components to create a collision-resistant
150/// identifier:
151/// 1. **Type ID** ([`StableTypeID`]): Uniquely identifies the Rust type
152/// 2. **Content Hash** ([`Compact128`]): 128-bit hash of the value's content
153///
154/// This dual-component design prevents type confusion: values of different
155/// types with the same content hash will still have different `InternedID`s.
156///
157/// # Structure
158///
159/// ```text
160/// InternedID {
161/// stable_type_id: 0x1234_5678_9abc_def0, // Type's stable ID
162/// hash_128: 0xfedC_BA98_7654_3210_..., // 128-bit content hash
163/// }
164/// ```
165///
166/// # Example Scenario
167///
168/// ```ignore
169/// // These would have the same content hash but different type IDs
170/// let string_42 = interner.intern("42".to_string());
171/// let int_42 = interner.intern(42_i32);
172///
173/// // Different InternedIDs prevent type confusion:
174/// // - string_42's ID: (String type ID, hash("42"))
175/// // - int_42's ID: (i32 type ID, hash(42))
176/// ```
177///
178/// # Hash Collisions
179///
180/// While 128-bit hashes make collisions astronomically unlikely, the
181/// combination with type IDs provides an additional layer of protection.
182/// Different types cannot collide even if their content hashes match.
183///
184/// # Serialization
185///
186/// When serializing [`Interned<T>`], subsequent occurrences of the same value
187/// are encoded as references using this ID, significantly reducing serialized
188/// size for duplicate data.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct InternedID {
    /// Stable identifier of the interned value's Rust type; prevents
    /// cross-type collisions even when content hashes match.
    stable_type_id: StableTypeID,
    /// 128-bit stable hash of the value's content.
    hash_128: Compact128,
}
194
195/// A reference-counted handle to an interned value with transparent access.
196///
197/// `Interned<T>` wraps an `Arc<T>` and provides seamless access to the
198/// underlying value through the [`Deref`] trait. When multiple `Interned<T>`
199/// instances contain equal values (as determined by stable hash), they share
200/// the same underlying allocation when created through the same [`Interner`].
201///
202/// # Unsized Type Support
203///
204/// `Interned<T>` supports unsized types like `str` and `[T]`. Use
205/// [`Interner::intern_unsized`] to create `Interned<str>` or `Interned<[T]>`
206/// values. This avoids the extra layer of indirection from wrapper types like
207/// `String` or `Vec<T>`.
208///
209/// # Memory Management
210///
211/// - **Automatic deallocation**: The value is freed when all `Interned<T>`
212/// handles are dropped
213/// - **Weak references**: The interner only holds weak references, allowing
214/// garbage collection of unused values
215/// - **No memory leaks**: Values without active handles are automatically
216/// reclaimed
217///
218/// # Cloning
219///
220/// Cloning is extremely cheap (O(1)) as it only increments the reference count:
221/// ```ignore
222/// let s1 = interner.intern("data".to_string());
223/// let s2 = s1.clone(); // Fast: just increments refcount
224/// ```
225///
226/// # Comparison with `Arc<T>`
227///
228/// | Aspect | `Arc<T>` | `Interned<T>` |
229/// |--------|----------|---------------|
230/// | Deduplication | No (each `Arc::new` creates new allocation) | Yes (equal values share allocation) |
231/// | Overhead | Reference count only | Hash computation + lookup |
232/// | Memory usage | Higher for duplicates | Lower for duplicates |
233/// | Creation speed | Faster | Slower (requires hashing) |
234/// | Cross-execution stability | No | Yes (with stable hashing) |
235///
236/// # Serialization
237///
238/// When using [`Encode`]/[`Decode`] traits, `Interned<T>` automatically
239/// deduplicates values:
240/// - **First occurrence**: Serialized in full
241/// - **Subsequent occurrences**: Serialized as hash reference only
242///
243/// This dramatically reduces serialized size when many duplicate values exist.
244///
245/// # Example
246///
247/// ```ignore
248/// // Create interned values
249/// let s1 = interner.intern("hello".to_string());
250/// let s2 = interner.intern("hello".to_string());
251///
252/// // They share the same allocation
253/// assert!(Arc::ptr_eq(&s1.0, &s2.0));
254///
255/// // Transparent access
256/// assert_eq!(&*s1, "hello");
257/// assert_eq!(s1.len(), 5);
258///
259/// // Get owned copy if needed
260/// let owned: String = s1.clone_inner();
261/// ```
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash, StableHash)]
#[stable_hash_crate(qbice_stable_hash)]
// Single private field: the shared allocation. The derived comparisons and
// hashes delegate through `Arc<T>` to the pointee `T`.
pub struct Interned<T: ?Sized>(Arc<T>);
265
266impl<T: ?Sized> Clone for Interned<T> {
267 fn clone(&self) -> Self { Self(self.0.clone()) }
268}
269
270impl<T> Interned<T> {
271 /// Returns an owned clone of the interned value.
272 #[must_use]
273 pub fn clone_inner(&self) -> T
274 where
275 T: Clone,
276 {
277 self.0.as_ref().clone()
278 }
279}
280
281impl<T: ?Sized> Deref for Interned<T> {
282 type Target = T;
283
284 fn deref(&self) -> &Self::Target { &self.0 }
285}
286
287impl<T: ?Sized> AsRef<T> for Interned<T> {
288 fn as_ref(&self) -> &T { &self.0 }
289}
290
291impl<T: ?Sized> Borrow<T> for Interned<T> {
292 fn borrow(&self) -> &T { &self.0 }
293}
294
/// Wire representation of an interned value.
///
/// The first occurrence of a value in a session is written as `Source`
/// (full payload); later occurrences are written as `Reference` (content
/// hash only).
enum WiredInterned<T> {
    /// The full value, serialized inline.
    Source(T),
    /// A back-reference to an already-serialized value, by 128-bit hash.
    Reference(Compact128),
}
299
300impl<T: Encode> Encode for WiredInterned<T> {
301 fn encode<E: Encoder + ?Sized>(
302 &self,
303 encoder: &mut E,
304 plugin: &Plugin,
305 session: &mut Session,
306 ) -> std::io::Result<()> {
307 match self {
308 Self::Source(value) => {
309 encoder.emit_u8(0)?;
310 value.encode(encoder, plugin, session)
311 }
312 Self::Reference(hash) => {
313 encoder.emit_u8(1)?;
314 hash.encode(encoder, plugin, session)
315 }
316 }
317 }
318}
319
320impl<T: Decode> Decode for WiredInterned<T> {
321 fn decode<D: Decoder + ?Sized>(
322 decoder: &mut D,
323 plugin: &Plugin,
324 session: &mut Session,
325 ) -> std::io::Result<Self> {
326 let tag = decoder.read_u8()?;
327 match tag {
328 0 => {
329 let value = T::decode(decoder, plugin, session)?;
330 Ok(Self::Source(value))
331 }
332 1 => {
333 let hash = Compact128::decode(decoder, plugin, session)?;
334 Ok(Self::Reference(hash))
335 }
336
337 _ => Err(std::io::Error::new(
338 std::io::ErrorKind::InvalidData,
339 "invalid tag for WiredInterned",
340 )),
341 }
342 }
343}
344
345impl Decode for Interned<str> {
346 fn decode<D: Decoder + ?Sized>(
347 decoder: &mut D,
348 plugin: &Plugin,
349 session: &mut Session,
350 ) -> std::io::Result<Self> {
351 let wired =
352 WiredInterned::<Box<str>>::decode(decoder, plugin, session)?;
353
354 let interner = plugin.get::<Interner>().expect(
355 "`SharedInterner` plugin missing for decoding `Interned<str>`",
356 );
357
358 let value = match wired {
359 WiredInterned::Source(source) => interner.intern_unsized(source),
360
361 WiredInterned::Reference(compact128) => interner
362 .get_from_hash::<str>(compact128)
363 .expect("referenced interned value not found in interner"),
364 };
365
366 Ok(value)
367 }
368}
369
370impl<T: Decode + StableHash + Identifiable + Send + Sync + 'static> Decode
371 for Interned<[T]>
372{
373 fn decode<D: Decoder + ?Sized>(
374 decoder: &mut D,
375 plugin: &Plugin,
376 session: &mut Session,
377 ) -> std::io::Result<Self> {
378 let wired =
379 WiredInterned::<Box<[T]>>::decode(decoder, plugin, session)?;
380
381 let interner = plugin.get::<Interner>().expect(
382 "`SharedInterner` plugin missing for decoding `Interned<[T]>`",
383 );
384
385 let value = match wired {
386 WiredInterned::Source(source) => interner.intern_unsized(source),
387
388 WiredInterned::Reference(compact128) => interner
389 .get_from_hash::<[T]>(compact128)
390 .expect("referenced interned value not found in interner"),
391 };
392
393 Ok(value)
394 }
395}
396
397impl Decode for Interned<Path> {
398 fn decode<D: Decoder + ?Sized>(
399 decoder: &mut D,
400 plugin: &Plugin,
401 session: &mut Session,
402 ) -> std::io::Result<Self> {
403 let wired =
404 WiredInterned::<Box<Path>>::decode(decoder, plugin, session)?;
405
406 let interner = plugin.get::<Interner>().expect(
407 "`SharedInterner` plugin missing for decoding `Interned<Path>`",
408 );
409
410 let value = match wired {
411 WiredInterned::Source(source) => interner.intern_unsized(source),
412
413 WiredInterned::Reference(compact128) => interner
414 .get_from_hash::<Path>(compact128)
415 .expect("referenced interned value not found in interner"),
416 };
417
418 Ok(value)
419 }
420}
421
/// A session key for tracking seen interned IDs during encoding.
///
/// Zero-sized marker type; the associated value lives in the [`Session`].
struct SeenInterned;

impl SessionKey for SeenInterned {
    /// The set of IDs already emitted in full during this encode session;
    /// membership means later occurrences are written as hash references.
    type Value = FxHashSet<InternedID>;
}
428
429impl<T: Identifiable + StableHash + Encode + Send + Sync + 'static + ?Sized>
430 Encode for Interned<T>
431{
432 fn encode<E: Encoder + ?Sized>(
433 &self,
434 encoder: &mut E,
435 plugin: &Plugin,
436 session: &mut Session,
437 ) -> std::io::Result<()> {
438 let interner = plugin.get::<Interner>().expect(
439 "`SharedInterner` plugin missing for encoding `Interned<T>`",
440 );
441
442 let value: &T = &self.0;
443 let compact_128 = interner.hash_128(value);
444
445 let seen_interned = session.get_mut_or_default::<SeenInterned>();
446 let first = seen_interned.insert(InternedID {
447 stable_type_id: T::STABLE_TYPE_ID,
448 hash_128: compact_128,
449 });
450
451 if first {
452 // serialize the full value
453 encoder.emit_u8(0)?;
454 value.encode(encoder, plugin, session)
455 } else {
456 // serialize only the reference
457 encoder.emit_u8(1)?;
458 compact_128.encode(encoder, plugin, session)
459 }
460 }
461}
462
463impl<T: Identifiable + StableHash + Decode + Send + Sync + 'static> Decode
464 for Interned<T>
465{
466 fn decode<D: Decoder + ?Sized>(
467 decoder: &mut D,
468 plugin: &Plugin,
469 session: &mut Session,
470 ) -> std::io::Result<Self> {
471 let wired = WiredInterned::<T>::decode(decoder, plugin, session)?;
472 let interner = plugin.get::<Interner>().expect(
473 "`SharedInterner` plugin missing for decoding `Interned<T>`",
474 );
475
476 let value = match wired {
477 WiredInterned::Source(source) => interner.intern(source),
478
479 WiredInterned::Reference(compact128) => interner
480 .get_from_hash::<T>(compact128)
481 .expect("referenced interned value not found in interner"),
482 };
483
484 Ok(value)
485 }
486}
487
488fn stable_hash<H: BuildStableHasher + 'static>(
489 build_hasher: &dyn Any,
490 value: &dyn DynStableHash,
491) -> u128
492where
493 <H as BuildStableHasher>::Hasher: StableHasher<Hash = u128>,
494{
495 let build_hasher =
496 build_hasher.downcast_ref::<H>().expect("invalid hasher type");
497
498 let mut hasher = build_hasher.build_stable_hasher();
499 value.stable_hash_dyn(&mut hasher);
500 hasher.finish()
501}
502
/// Object-safe adapter over [`StableHash`], allowing values to be hashed
/// through a `&dyn` reference with a fixed 128-bit hasher.
trait DynStableHash {
    fn stable_hash_dyn(&self, hasher: &mut dyn StableHasher<Hash = u128>);
}

/// Blanket impl: any `StableHash` type (sized or not) is dyn-hashable.
impl<T: StableHash + ?Sized> DynStableHash for T {
    fn stable_hash_dyn(&self, hasher: &mut dyn StableHasher<Hash = u128>) {
        StableHash::stable_hash(self, hasher);
    }
}
512
/// A type-erased per-type shard holding weak references to interned values.
struct Shard {
    /// The concrete `TypedShard<T>` behind `dyn Any`; callers downcast with
    /// the concrete `T` before use.
    typed_shard: Box<dyn Any + Send + Sync>,
    /// Monomorphized cleanup routine that knows the concrete `T` and can
    /// sweep dead weak references out of `typed_shard`.
    vacuum_fn: fn(&dyn Any),
}

/// Sharded map from content hash to weak reference, for one concrete `T`.
type TypedShard<T> = Sharded<HashMap<Compact128, Weak<T>, FxBuildHasher>>;
/// Top-level sharded map from type ID to that type's [`Shard`].
type WholeShard = Sharded<HashMap<StableTypeID, Shard, FxBuildHasher>>;
520
/// Shared interner state behind the `Arc` inside [`Interner`].
struct Repr {
    /// Per-type sharded maps of weakly-held interned values.
    shards: WholeShard,

    /// The user-supplied hasher builder, type-erased; always paired with
    /// `stable_hash_fn`, which knows the concrete type.
    hasher_builder_erased: Box<dyn Any + Send + Sync>,
    /// Monomorphized hashing function that downcasts
    /// `hasher_builder_erased` and produces the 128-bit stable hash.
    stable_hash_fn: fn(&dyn Any, &dyn DynStableHash) -> u128,

    /// Sender to signal the vacuum thread.
    vacuum_command: Option<Sender<VacuumCommand>>,
}
530
531/// A thread-safe interner for deduplicating values based on stable hashing.
532///
533/// The `Interner` uses sharding to enable high-concurrency access from multiple
534/// threads with minimal contention. Values are identified by their 128-bit
535/// stable hash combined with type ID, ensuring safe and deterministic
536/// deduplication.
537///
538/// # How It Works
539///
540/// 1. **Hash**: When interning a value, compute its stable 128-bit hash
541/// 2. **Lookup**: Check if a value with that hash already exists in the
542/// appropriate shard
543/// 3. **Reuse or Store**:
544/// - If found and alive: return handle to existing value
545/// - Otherwise: store new value and return handle to it
546///
547/// # Sharding Architecture
548///
549/// ```text
550/// Interner
551/// ├─ Shard 0: HashMap<InternedID, Weak<T>>
552/// ├─ Shard 1: HashMap<InternedID, Weak<T>>
553/// ├─ ...
554/// └─ Shard N: HashMap<InternedID, Weak<T>>
555/// ```
556///
557/// Each shard:
558/// - Has its own lock (reducing contention)
559/// - Handles a subset of hash values
560/// - Can be accessed in parallel with other shards
561///
562/// # Weak References and Garbage Collection
563///
564/// The interner stores [`Weak`] references, not strong references:
565/// - Values are kept alive only by [`Interned<T>`] handles
566/// - When all handles are dropped, the value is deallocated
567/// - Subsequent intern calls for the same value create a new allocation
568/// - Dead weak references can be cleaned up via [`vacuum`](Self::vacuum) or
569/// automatically by a background vacuum thread
570///
571/// This prevents memory leaks from long-lived interners accumulating values.
572///
573/// # Background Vacuum Thread
574///
575/// The interner can optionally run a background thread that periodically
576/// removes dead weak references from the shards. Use
577/// [`new_with_vacuum`](Self::new_with_vacuum) to create an interner with
578/// automatic cleanup:
579///
580/// ```ignore
581/// use std::time::Duration;
582///
583/// // Vacuum every 60 seconds
584/// let interner = Interner::new_with_vacuum(
585/// 16,
586/// BuildStableHasher128::default(),
587/// Duration::from_secs(60),
588/// );
589///
590/// // Or trigger vacuum manually
591/// interner.vacuum();
592/// ```
593///
594/// # Thread Safety
595///
596/// Fully thread-safe and can be shared using `.clone()` directly. Multiple
597/// threads can safely call [`intern`](Self::intern) concurrently. Sharding
598/// ensures that operations on different hash buckets don't block each other.
599///
600/// # Performance Characteristics
601///
602/// - **Intern**: O(1) expected, requires hashing entire value
603/// - **Lookup**: O(1) expected, requires hashing entire value
604/// - **Memory**: O(unique values + shard overhead)
605/// - **Concurrency**: Excellent when shards ≥ typical concurrent threads
606///
607/// # Example
608///
609/// ```ignore
610/// use qbice_stable_hash::BuildStableHasher128;
611///
612/// let interner = Interner::new(16, BuildStableHasher128::default());
613///
614/// // Intern values
615/// let v1 = interner.intern("data".to_string());
616/// let v2 = interner.intern("data".to_string());
617///
618/// // Same allocation
619/// assert!(Arc::ptr_eq(&v1.0, &v2.0));
620///
621/// // When all handles drop, value is freed
622/// drop(v1);
623/// drop(v2);
624/// // "data" is now deallocated
625/// ```
#[derive(Debug, Clone)]
pub struct Interner {
    /// Shared state (shards, hasher, vacuum channel); cloning the interner
    /// just clones this `Arc`.
    inner: Arc<Repr>,
    /// Handle to the vacuum thread (if any). Kept for lifetime management.
    _vacuum_thread: Option<Arc<VacuumThreadHandle>>,
}
632
/// Command sent to the vacuum thread.
///
/// Delivered over the channel stored in `Repr::vacuum_command`; `Stop` is
/// also sent by [`VacuumThreadHandle`] when it is dropped.
enum VacuumCommand {
    /// Trigger an immediate vacuum cycle.
    Trigger,
    /// Stop the vacuum thread.
    Stop,
}
640
/// Handle to manage the vacuum thread's lifecycle.
///
/// Dropping this handle signals the thread to stop and joins it.
struct VacuumThreadHandle {
    /// Channel used to send [`VacuumCommand`]s to the worker thread.
    command_sender: Sender<VacuumCommand>,
    /// Join handle for the worker; `Option` so `Drop` can `take()` it.
    join_handle: Option<JoinHandle<()>>,
}
646
647impl std::fmt::Debug for VacuumThreadHandle {
648 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
649 f.debug_struct("VacuumThreadHandle").finish_non_exhaustive()
650 }
651}
652
653impl Drop for VacuumThreadHandle {
654 fn drop(&mut self) {
655 // Signal the thread to stop
656 let _ = self.command_sender.send(VacuumCommand::Stop);
657
658 // Wait for the thread to finish
659 if let Some(handle) = self.join_handle.take() {
660 let _ = handle.join();
661 }
662 }
663}
664
665impl Interner {
666 /// Creates a new shared interner with the specified sharding and hasher.
667 ///
668 /// This creates an interner without a background vacuum thread. Dead weak
669 /// references will accumulate until you call [`vacuum`](Self::vacuum)
670 /// manually or until they are replaced by new intern calls.
671 ///
672 /// For automatic cleanup, use [`new_with_vacuum`](Self::new_with_vacuum).
673 ///
674 /// # Parameters
675 ///
676 /// - `shard_amount`: The number of shards for concurrent access. More
677 /// shards reduce contention but increase memory overhead. Recommended:
678 /// 16-32 for multi-threaded use, or the number of CPU cores.
679 /// - `hasher_builder`: The builder for creating stable hashers used to hash
680 /// interned values. Must produce deterministic 128-bit hashes.
681 ///
682 /// # Example
683 ///
684 /// ```ignore
685 /// use qbice_stable_hash::BuildStableHasher128;
686 ///
687 /// let interner = Interner::new(16, BuildStableHasher128::default());
688 /// ```
689 pub fn new<S: BuildStableHasher<Hash = u128> + Send + Sync + 'static>(
690 shard_amount: usize,
691 hasher_builder: S,
692 ) -> Self {
693 Self {
694 inner: Arc::new(Repr::new(shard_amount, hasher_builder)),
695 _vacuum_thread: None,
696 }
697 }
698
699 /// Creates a new shared interner with a background vacuum thread.
700 ///
701 /// The vacuum thread periodically removes dead weak references from the
702 /// interner's shards, freeing up memory from entries that are no longer
703 /// referenced.
704 ///
705 /// # Parameters
706 ///
707 /// - `shard_amount`: The number of shards for concurrent access. More
708 /// shards reduce contention but increase memory overhead. Recommended:
709 /// 16-32 for multi-threaded use, or the number of CPU cores.
710 /// - `hasher_builder`: The builder for creating stable hashers used to hash
711 /// interned values. Must produce deterministic 128-bit hashes.
712 /// - `vacuum_interval`: The duration between automatic vacuum runs. The
713 /// vacuum thread will sleep for this duration between cleanup cycles.
714 ///
715 /// # Thread Behavior
716 ///
717 /// - The vacuum thread is automatically stopped when the interner is
718 /// dropped
719 /// - You can still call [`vacuum`](Self::vacuum) manually for immediate
720 /// cleanup
721 /// - The vacuum process acquires write locks on shards, so very frequent
722 /// vacuuming may impact performance
723 ///
724 /// # Recommended Intervals
725 ///
726 /// - Low memory pressure: 5-10 minutes
727 /// - Moderate usage: 30-60 seconds
728 /// - High churn (many short-lived values): 5-15 seconds
729 ///
730 /// # Example
731 ///
732 /// ```ignore
733 /// use std::time::Duration;
734 /// use qbice_stable_hash::BuildStableHasher128;
735 ///
736 /// // Create interner with vacuum every 30 seconds
737 /// let interner = Interner::new_with_vacuum(
738 /// 16,
739 /// BuildStableHasher128::default(),
740 /// Duration::from_secs(30),
741 /// );
742 /// ```
743 pub fn new_with_vacuum<
744 S: BuildStableHasher<Hash = u128> + Send + Sync + 'static,
745 >(
746 shard_amount: usize,
747 hasher_builder: S,
748 vacuum_interval: Duration,
749 ) -> Self {
750 let (command_tx, command_rx) = unbounded::<VacuumCommand>();
751
752 let repr = Arc::new(Repr::new_with_vacuum(
753 shard_amount,
754 hasher_builder,
755 command_tx.clone(),
756 ));
757
758 let repr_weak = Arc::downgrade(&repr);
759
760 let join_handle = std::thread::Builder::new()
761 .name("interner-vacuum".to_string())
762 .spawn(move || {
763 vacuum_thread_loop(&repr_weak, &command_rx, vacuum_interval);
764 })
765 .expect("failed to spawn vacuum thread");
766
767 Self {
768 inner: repr,
769 _vacuum_thread: Some(Arc::new(VacuumThreadHandle {
770 command_sender: command_tx,
771 join_handle: Some(join_handle),
772 })),
773 }
774 }
775
776 /// Manually triggers a vacuum operation to clean up dead weak references.
777 ///
778 /// This method removes all dead [`Weak`] references from the interner's
779 /// internal hash maps. Dead references occur when all [`Interned<T>`]
780 /// handles for a value have been dropped.
781 ///
782 /// # When to Use
783 ///
784 /// - After a large batch of interned values has been dropped
785 /// - Before serializing the interner state (to reduce size)
786 /// - In memory-constrained environments
787 /// - When not using a background vacuum thread
788 ///
789 /// # Performance
790 ///
791 /// - Acquires write locks on all shards sequentially
792 /// - Time complexity: O(total entries across all shards)
793 /// - May briefly block other intern/lookup operations
794 ///
795 /// # Example
796 ///
797 /// ```ignore
798 /// // Intern many temporary values
799 /// for i in 0..10000 {
800 /// let _ = interner.intern(format!("temp_{}", i));
801 /// }
802 /// // All Interned handles dropped, but weak refs remain
803 ///
804 /// // Clean up dead references
805 /// interner.vacuum();
806 /// ```
807 pub fn vacuum(&self) { vacuum_shards(&self.inner.shards); }
808
809 /// Requests the background vacuum thread to run immediately.
810 ///
811 /// If the interner was created with
812 /// [`new_with_vacuum`](Self::new_with_vacuum), this signals the
813 /// background thread to perform a vacuum cycle as soon as
814 /// possible, rather than waiting for the next scheduled interval.
815 ///
816 /// If no background vacuum thread exists (interner created with
817 /// [`new`](Self::new)), this method does nothing.
818 ///
819 /// # Non-blocking
820 ///
821 /// This method returns immediately after signaling the thread. It does not
822 /// wait for the vacuum operation to complete. If you need synchronous
823 /// cleanup, use [`vacuum`](Self::vacuum) instead.
824 ///
825 /// # Example
826 ///
827 /// ```ignore
828 /// // Request early vacuum (non-blocking)
829 /// interner.request_vacuum();
830 ///
831 /// // Or use vacuum() for synchronous cleanup
832 /// interner.vacuum();
833 /// ```
834 pub fn request_vacuum(&self) {
835 if let Some(sender) = &self.inner.vacuum_command {
836 // Send trigger command; unbounded channel won't block
837 let _ = sender.send(VacuumCommand::Trigger);
838 }
839 }
840
841 /// Computes the 128-bit stable hash of a value.
842 ///
843 /// This method uses the interner's configured hasher to compute a
844 /// deterministic hash. The same value will always produce the same hash
845 /// within this interner instance (and across instances using the same
846 /// hasher configuration).
847 ///
848 /// # Parameters
849 ///
850 /// * `value` - A reference to the value to hash. Must implement
851 /// [`StableHash`].
852 ///
853 /// # Returns
854 ///
855 /// A [`Compact128`] representing the 128-bit stable hash of the value.
856 ///
857 /// # Use Cases
858 ///
859 /// This method is primarily used internally but can be useful for:
860 /// - Computing hashes for caching or indexing
861 /// - Debugging hash collisions
862 /// - Implementing custom interning strategies
863 ///
864 /// # Example
865 ///
866 /// ```ignore
867 /// let hash1 = interner.hash_128(&"hello".to_string());
868 /// let hash2 = interner.hash_128(&"hello".to_string());
869 /// assert_eq!(hash1, hash2); // Same value, same hash
870 ///
871 /// let hash3 = interner.hash_128(&"world".to_string());
872 /// assert_ne!(hash1, hash3); // Different value, different hash
873 /// ```
874 pub fn hash_128<T: StableHash + ?Sized>(&self, value: &T) -> Compact128 {
875 let hash_u128 = (self.inner.stable_hash_fn)(
876 &*self.inner.hasher_builder_erased,
877 &value,
878 );
879
880 hash_u128.into()
881 }
882
    /// Returns a read guard over the per-type [`Shard`] for `T`, creating
    /// the shard on first use.
    ///
    /// Alternates between a fast path (read lock; shard already present)
    /// and a slow path (write lock; insert an empty shard), looping until
    /// the fast path succeeds.
    fn obtain_read_shard<T: Identifiable + Send + Sync + 'static + ?Sized>(
        &self,
    ) -> MappedRwLockReadGuard<'_, Shard> {
        let shard_amount = self.inner.shards.shard_amount();
        let stable_type_id = T::STABLE_TYPE_ID;

        loop {
            let shard_index =
                self.inner.shards.shard_index(stable_type_id.low());
            let shard = self.inner.shards.read_shard(shard_index);

            // fast path, obtain read lock if shard exists
            if let Ok(exist) =
                RwLockReadGuard::try_map(shard, |x| x.get(&stable_type_id))
            {
                return exist;
            }

            // slow path, need to create the shard, the `shard` has already
            // dropped here
            let mut write_shard = self.inner.shards.write_shard(shard_index);

            // Another thread may have inserted the shard between releasing
            // the read guard and taking the write guard; `Entry::Vacant`
            // makes the insert race-safe. The next loop iteration then
            // returns through the fast path.
            if let Entry::Vacant(entry) = write_shard.entry(stable_type_id) {
                entry.insert(Shard {
                    typed_shard: {
                        Box::new(TypedShard::<T>::new(shard_amount, |_| {
                            HashMap::default()
                        }))
                    },
                    // Monomorphized cleanup fn for this concrete `T`.
                    vacuum_fn: vacuum_shard::<T>,
                });
            }
        }
    }
917
918 /// Retrieves an interned value by its 128-bit hash, if it exists and is
919 /// alive.
920 ///
921 /// This method looks up a value that was previously interned and still has
922 /// at least one active [`Interned<T>`] handle. It's primarily used during
923 /// deserialization to resolve hash references back to actual values.
924 ///
925 /// # Parameters
926 ///
927 /// * `hash_128` - The 128-bit stable hash of the value to retrieve.
928 ///
929 /// # Type Parameter
930 ///
931 /// * `T` - The expected type of the value. Must match the type that was
932 /// originally interned.
933 ///
934 /// # Returns
935 ///
936 /// * `Some(Interned<T>)` - If a value with this hash exists, is still alive
937 /// (has active handles), and has the correct type
938 /// * `None` - If:
939 /// - No value with this hash exists
940 /// - The value has been deallocated (no active handles)
941 /// - Type mismatch (requested type doesn't match interned type)
942 ///
943 /// # Type Safety
944 ///
945 /// The method uses [`StableTypeID`] to ensure type safety. Attempting to
946 /// retrieve a value with the wrong type returns `None`, preventing type
947 /// confusion even if hash values match.
948 ///
949 /// # Serialization Context
950 ///
951 /// During serialization, [`Interned<T>`] values are encoded as:
952 /// 1. **First occurrence**: Full value + hash
953 /// 2. **Subsequent occurrences**: Hash reference only
954 ///
955 /// During deserialization, hash references are resolved using this method
956 /// to reconstruct the original value structure.
957 ///
958 /// # Example
959 ///
960 /// ```ignore
961 /// // Intern a value
962 /// let original = interner.intern("data".to_string());
963 /// let hash = interner.hash_128(&*original);
964 ///
965 /// // Retrieve by hash
966 /// let retrieved = interner.get_from_hash::<String>(hash)
967 /// .expect("value should still be alive");
968 ///
969 /// assert!(Arc::ptr_eq(&original.0, &retrieved.0));
970 ///
971 /// // After all handles drop, lookup returns None
972 /// drop(original);
973 /// drop(retrieved);
974 /// assert!(interner.get_from_hash::<String>(hash).is_none());
975 /// ```
976 #[must_use]
977 pub fn get_from_hash<T: Identifiable + Send + Sync + 'static + ?Sized>(
978 &self,
979 hash_128: Compact128,
980 ) -> Option<Interned<T>> {
981 let read_shard = self.obtain_read_shard::<T>();
982 let typed_shard = read_shard
983 .typed_shard
984 .downcast_ref::<TypedShard<T>>()
985 .expect("should be correct type");
986
987 let shard_index = typed_shard.shard_index(hash_128.low());
988 let read_lock = typed_shard.read_shard(shard_index);
989
990 read_lock
991 .get(&hash_128)
992 .and_then(std::sync::Weak::upgrade)
993 .map(|arc| Interned(arc))
994 }
995
996 /// Interns a value, returning a reference-counted handle to the shared
997 /// allocation.
998 ///
999 /// If an equal value (as determined by stable hash) has already been
1000 /// interned and is still alive, this method returns a handle to the
1001 /// existing allocation. Otherwise, it stores the new value and returns a
1002 /// handle to it.
1003 ///
1004 /// # Equality Semantics
1005 ///
1006 /// Values are considered equal if:
1007 /// 1. They have the same type (same [`StableTypeID`])
1008 /// 2. They have the same stable hash (128-bit)
1009 ///
1010 /// **Important**: This uses **hash equality**, not structural equality
1011 /// (`PartialEq`). While 128-bit hash collisions are astronomically
1012 /// unlikely, they are theoretically possible. In practice, this is not a
1013 /// concern for most applications.
1014 ///
1015 /// # Type Parameters
1016 ///
1017 /// * `T` - The type of value to intern. Must implement:
1018 /// - [`StableHash`]: For computing the deterministic content hash
1019 /// - [`Identifiable`]: For the stable type ID
1020 /// - `Send + Sync + 'static`: For safe sharing across threads
1021 ///
1022 /// # Parameters
1023 ///
1024 /// * `value` - The value to intern. Takes ownership.
1025 ///
1026 /// # Returns
1027 ///
1028 /// An [`Interned<T>`] handle to the interned value. If an equal value was
1029 /// already interned, the returned handle points to the existing allocation.
1030 ///
1031 /// # Thread Safety
1032 ///
1033 /// This method is fully thread-safe and can be called concurrently:
1034 /// - If two threads intern the same value simultaneously, only one
1035 /// allocation is created
1036 /// - Both threads receive handles to the same allocation
1037 /// - The losing thread's value is dropped
1038 ///
1039 /// # Performance
1040 ///
1041 /// * **Time complexity**: O(1) expected, O(n) worst case (hash table)
1042 /// * **Hashing cost**: O(size of value) - entire value is hashed
1043 /// * **Lock contention**: Only affects the specific shard for this hash
1044 /// * **Memory**: Reuses existing allocation if value already interned
1045 ///
1046 /// # Trade-offs
1047 ///
1048 /// **Slower than `Arc::new`** because:
1049 /// - Must hash the entire value
1050 /// - Requires hash table lookup
1051 /// - May need to acquire locks
1052 ///
1053 /// **Saves memory when**:
1054 /// - Many duplicate values exist
1055 /// - Values are large
1056 /// - Long-lived values with many references
1057 ///
1058 /// # Example
1059 ///
1060 /// ```ignore
1061 /// // First intern - creates new allocation
1062 /// let s1 = interner.intern("hello".to_string());
1063 ///
1064 /// // Second intern - reuses existing allocation
1065 /// let s2 = interner.intern("hello".to_string());
1066 ///
1067 /// // Same pointer, same allocation
1068 /// assert!(Arc::ptr_eq(&s1.0, &s2.0));
1069 ///
1070 /// // Different value - new allocation
1071 /// let s3 = interner.intern("world".to_string());
1072 /// assert!(!Arc::ptr_eq(&s1.0, &s3.0));
1073 /// ```
1074 pub fn intern<T: StableHash + Identifiable + Send + Sync + 'static>(
1075 &self,
1076 value: T,
1077 ) -> Interned<T> {
1078 let hash_128 = self.hash_128(&value);
1079 let read_shard = self.obtain_read_shard::<T>();
1080 let typed_shard = read_shard
1081 .typed_shard
1082 .downcast_ref::<TypedShard<T>>()
1083 .expect("should be correct type");
1084
1085 let shard_index = typed_shard.shard_index(hash_128.low());
1086
1087 // First, try to find an existing interned value
1088 {
1089 let read_shard = typed_shard.read_shard(shard_index);
1090
1091 if let Some(arc) =
1092 read_shard.get(&hash_128).and_then(std::sync::Weak::upgrade)
1093 {
1094 return Interned(arc);
1095 }
1096 }
1097
1098 // Not found, so insert a new one
1099 {
1100 let mut write_shard = typed_shard.write_shard(shard_index);
1101
1102 match write_shard.entry(hash_128) {
1103 // double check in case another thread inserted it
1104 std::collections::hash_map::Entry::Occupied(mut entry) => {
1105 if let Some(arc) = entry.get().upgrade() {
1106 return Interned(arc);
1107 }
1108
1109 // The weak reference is dead, we can replace it
1110 let arc = Arc::new(value);
1111 let weak = Arc::downgrade(&arc);
1112
1113 entry.insert(weak);
1114
1115 Interned(arc)
1116 }
1117
1118 std::collections::hash_map::Entry::Vacant(entry) => {
1119 let arc = Arc::new(value);
1120 let weak = Arc::downgrade(&arc);
1121
1122 entry.insert(weak);
1123
1124 Interned(arc)
1125 }
1126 }
1127 }
1128 }
1129
1130 /// Interns an unsized value, returning a reference-counted handle to the
1131 /// shared allocation.
1132 ///
1133 /// This method is similar to [`intern`](Self::intern) but supports unsized
1134 /// types like `str` and `[T]`. The value is provided as a sized type `Q`
1135 /// that can be borrowed as `T` and converted into `Arc<T>`.
1136 ///
1137 /// # Type Parameters
1138 ///
1139 /// * `T` - The unsized type to intern (e.g., `str`, `[u8]`). Must
1140 /// implement:
1141 /// - [`StableHash`]: For computing the deterministic content hash
1142 /// - [`Identifiable`]: For the stable type ID
1143 /// - `Send + Sync + 'static`: For safe sharing across threads
1144 ///
1145 /// * `Q` - The sized type that owns the value. Must implement:
1146 /// - [`Borrow<T>`]: To access the unsized value for hashing
1147 /// - `Send + Sync + 'static`: For safe sharing across threads
1148 /// - `Arc<T>: From<Q>`: For efficient conversion to `Arc<T>`
1149 ///
1150 /// # Common Type Combinations
1151 ///
1152 /// | Unsized `T` | Sized `Q` options |
1153 /// |-------------|-------------------|
1154 /// | `str` | `String`, `Box<str>`, `Cow<'static, str>` |
1155 /// | `[u8]` | `Vec<u8>`, `Box<[u8]>` |
1156 /// | `[T]` | `Vec<T>`, `Box<[T]>` |
1157 ///
1158 /// # Example
1159 ///
1160 /// ```ignore
1161 /// // Intern strings as str slices
1162 /// let s1: Interned<str> = interner.intern_unsized("hello".to_string());
1163 /// let s2: Interned<str> = interner.intern_unsized("hello".to_string());
1164 ///
1165 /// // Same allocation
1166 /// assert!(Arc::ptr_eq(&s1.0, &s2.0));
1167 ///
1168 /// // Transparent access via Deref
1169 /// assert_eq!(&*s1, "hello");
1170 /// assert_eq!(s1.len(), 5);
1171 ///
1172 /// // Works with byte slices too
1173 /// let bytes: Interned<[u8]> = interner.intern_unsized(vec![1, 2, 3]);
1174 /// assert_eq!(&*bytes, &[1, 2, 3]);
1175 /// ```
1176 ///
1177 /// # Performance
1178 ///
1179 /// Same as [`intern`](Self::intern) - O(1) expected time with hashing
1180 /// overhead proportional to value size.
1181 pub fn intern_unsized<
1182 T: StableHash + Identifiable + Send + Sync + 'static + ?Sized,
1183 Q: Borrow<T> + Send + Sync + 'static,
1184 >(
1185 &self,
1186 value: Q,
1187 ) -> Interned<T>
1188 where
1189 Arc<T>: From<Q>,
1190 {
1191 let hash_128 = self.hash_128(value.borrow());
1192 let read_shard = self.obtain_read_shard::<T>();
1193 let typed_shard = read_shard
1194 .typed_shard
1195 .downcast_ref::<TypedShard<T>>()
1196 .expect("should be correct type");
1197
1198 let shard_index = typed_shard.shard_index(hash_128.low());
1199
1200 // First, try to find an existing interned value
1201 {
1202 let read_shard = typed_shard.read_shard(shard_index);
1203
1204 if let Some(arc) =
1205 read_shard.get(&hash_128).and_then(std::sync::Weak::upgrade)
1206 {
1207 return Interned(arc);
1208 }
1209 }
1210
1211 // Not found, so insert a new one
1212 {
1213 let mut write_shard = typed_shard.write_shard(shard_index);
1214
1215 match write_shard.entry(hash_128) {
1216 // double check in case another thread inserted it
1217 std::collections::hash_map::Entry::Occupied(mut entry) => {
1218 if let Some(arc) = entry.get().upgrade() {
1219 return Interned(arc);
1220 }
1221
1222 // The weak reference is dead, we can replace it
1223 let arc = Arc::from(value);
1224 let weak = Arc::downgrade(&arc);
1225
1226 entry.insert(weak);
1227
1228 Interned(arc)
1229 }
1230
1231 std::collections::hash_map::Entry::Vacant(entry) => {
1232 let arc = Arc::from(value);
1233 let weak = Arc::downgrade(&arc);
1234
1235 entry.insert(weak);
1236
1237 Interned(arc)
1238 }
1239 }
1240 }
1241 }
1242}
1243
impl std::fmt::Debug for Repr {
    // Opaque Debug: shard contents and the hasher are intentionally not
    // printed; only the type name plus `..` is emitted.
    //
    // NOTE(review): the struct name is reported as `Interner` (the public
    // type) rather than `Repr`, presumably because `Repr` is an internal
    // representation detail of `Interner` — confirm this is intentional.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("Interner").finish_non_exhaustive()
    }
}
1249
1250fn vacuum_shard<T: ?Sized + 'static>(shard: &dyn Any) {
1251 let typed_shard = shard
1252 .downcast_ref::<TypedShard<T>>()
1253 .expect("invalid shard type for vacuum");
1254
1255 for mut write_shard in typed_shard.iter_write_shards() {
1256 write_shard.retain(|_, weak_value| weak_value.upgrade().is_some());
1257 }
1258}
1259
1260/// Vacuum all shards in the interner, removing dead weak references.
1261fn vacuum_shards(shards: &WholeShard) {
1262 for read_shard in shards.iter_read_shards() {
1263 for shard in read_shard.values() {
1264 (shard.vacuum_fn)(&*shard.typed_shard);
1265 }
1266 }
1267}
1268
1269/// The main loop for the background vacuum thread.
1270fn vacuum_thread_loop(
1271 repr_weak: &Weak<Repr>,
1272 command_rx: &Receiver<VacuumCommand>,
1273 vacuum_interval: Duration,
1274) {
1275 loop {
1276 // Wait for a command with timeout as the vacuum interval
1277 match command_rx.recv_timeout(vacuum_interval) {
1278 Ok(VacuumCommand::Stop)
1279 | Err(crossbeam_channel::RecvTimeoutError::Disconnected) => {
1280 // Stop signal received or channel disconnected, exit the loop
1281 return;
1282 }
1283 Ok(VacuumCommand::Trigger)
1284 | Err(crossbeam_channel::RecvTimeoutError::Timeout) => {
1285 // Trigger signal received, run vacuum immediately
1286 }
1287 }
1288
1289 // Try to upgrade the weak reference to run vacuum
1290 if let Some(repr) = repr_weak.upgrade() {
1291 vacuum_shards(&repr.shards);
1292 } else {
1293 // The interner has been dropped, exit the loop
1294 return;
1295 }
1296 }
1297}
1298
1299impl Repr {
1300 /// Creates a new interner with specified sharding and stable hasher.
1301 ///
1302 /// # Parameters
1303 ///
1304 /// * `shard_amount` - The number of shards for concurrent access. Each
1305 /// shard has its own lock, so more shards reduce contention at the cost
1306 /// of memory overhead.
1307 ///
1308 /// **Recommended values:**
1309 /// - Single-threaded: 1-4 shards
1310 /// - Multi-threaded (typical): 16-32 shards
1311 /// - High contention: 64-128 shards
1312 /// - Powers of 2 work best (efficient bit masking)
1313 ///
1314 /// * `hasher_builder` - Builder for creating stable hashers. Must produce
1315 /// deterministic 128-bit hashes. The same input must always produce the
1316 /// same hash across all program executions for serialization to work
1317 /// correctly.
1318 ///
1319 /// # Panics
1320 ///
1321 /// Panics if `shard_amount` is not a power of two.
1322 ///
1323 /// # Example
1324 ///
1325 /// ```ignore
1326 /// use qbice_stable_hash::BuildStableHasher128;
1327 ///
1328 /// // Create interner with 16 shards
1329 /// let interner = Interner::new(16, BuildStableHasher128::default());
1330 /// ```
1331 pub fn new<S: BuildStableHasher + Send + Sync + 'static>(
1332 shard_amount: usize,
1333 hasher_builder: S,
1334 ) -> Self
1335 where
1336 <S as BuildStableHasher>::Hasher: StableHasher<Hash = u128>,
1337 {
1338 Self {
1339 shards: Sharded::new(shard_amount, |_| HashMap::default()),
1340 hasher_builder_erased: Box::new(hasher_builder),
1341 stable_hash_fn: stable_hash::<S>,
1342 vacuum_command: None,
1343 }
1344 }
1345
1346 /// Creates a new interner with vacuum thread support.
1347 pub fn new_with_vacuum<S: BuildStableHasher + Send + Sync + 'static>(
1348 shard_amount: usize,
1349 hasher_builder: S,
1350 vacuum_command: Sender<VacuumCommand>,
1351 ) -> Self
1352 where
1353 <S as BuildStableHasher>::Hasher: StableHasher<Hash = u128>,
1354 {
1355 Self {
1356 shards: Sharded::new(shard_amount, |_| HashMap::default()),
1357 hasher_builder_erased: Box::new(hasher_builder),
1358 stable_hash_fn: stable_hash::<S>,
1359 vacuum_command: Some(vacuum_command),
1360 }
1361 }
1362}
1363
1364#[cfg(test)]
1365mod test;