atum 0.1.0

Lock-free bidirectional Atom Table, optimized for multi-threaded workloads
Documentation
#![doc = include_str!("../README.md")]
#![deny(missing_docs, clippy::all)]
#![allow(clippy::needless_doctest_main)]

mod profiling;
mod unwrap_dbg;

use crate::unwrap_dbg::UnwrapDbg;

use std::hash::Hash;
use std::sync::atomic::{AtomicU64, Ordering};

use dashmap::DashMap;
use ecow::EcoString;
use papaya::LocalGuard;
use rustc_hash::FxBuildHasher;

impl nohash_hasher::IsEnabled for AtomId {}

type NoHashDashMap<K, V> = DashMap<K, V, nohash_hasher::BuildNoHashHasher<K>>;
type FxHashPapayaHashMap<K, V> = papaya::HashMap<K, V, rustc_hash::FxBuildHasher>;
type StrToIdMapRef<'map, 'g> =
    papaya::HashMapRef<'map, EcoString, AtomId, FxBuildHasher, LocalGuard<'g>>;

pub mod prelude {
    //! Public re-exports for convenience.
    pub use crate::{AtomId, AtomTable};
}

/// Strongly-typed atom id.
#[derive(Copy, Clone, PartialEq, Eq, Hash, Debug, PartialOrd, Ord)]
pub struct AtomId(pub u64);

/// Lock-free, bidirectional string atom table
#[derive(Debug, Default)]
pub struct AtomTable {
    atom_id_to_str: NoHashDashMap<AtomId, EcoString>,
    str_to_atom_id: FxHashPapayaHashMap<EcoString, AtomId>,
    next_atom_id: AtomicU64,
}

impl Clone for AtomTable {
    fn clone(&self) -> Self {
        Self {
            str_to_atom_id: self.str_to_atom_id.clone(),
            atom_id_to_str: self.atom_id_to_str.clone(),
            next_atom_id: AtomicU64::new(self.next_atom_id.load(Ordering::Relaxed)),
        }
    }
}

impl AtomTable {
    /// Create new atom table.
    #[inline]
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Create with capacity.
    #[inline]
    #[must_use]
    pub fn with_capacity(capacity: usize) -> Self {
        Self {
            str_to_atom_id: FxHashPapayaHashMap::with_capacity_and_hasher(capacity, FxBuildHasher),
            atom_id_to_str: NoHashDashMap::with_capacity_and_hasher(
                capacity,
                nohash_hasher::BuildNoHashHasher::default(),
            ),
            next_atom_id: AtomicU64::new(0),
        }
    }

    /// Intern a string. Fast-path is zero-allocation for existing strings.
    ///
    /// ```rust
    /// use atum::AtomTable;
    /// let tbl = AtomTable::new();
    /// let atom = tbl.intern("Hello, Sailor!");
    /// assert_eq!(tbl.lookup_ref(atom).as_ref(), "Hello, Sailor!");
    /// assert_eq!(tbl.intern("Hello, Sailor!"), atom);
    /// ```
    ///
    /// Returns an `AtomId` unique for the interned string.
    #[inline(always)]
    pub fn intern(&self, s: &str) -> AtomId {
        let _span = tracy_span!("AtomTable::intern");

        let g = self.str_to_atom_id.pin();

        // zero-alloc fast path
        if let Some(&id) = g.get(s) {
            return id;
        }

        self.intern_cold(s, &g)
    }

    /// Intern a string with a pre-pinned guard (for batch operations).
    ///
    /// This is more efficient when interning multiple strings in sequence
    /// as it avoids repeatedly pinning/unpinning the guard.
    #[inline]
    pub fn intern_with_guard(&self, s: &str, g: &StrToIdMapRef<'_, '_>) -> AtomId {
        if let Some(&id) = g.get(s) {
            return id;
        }
        self.intern_cold(s, g)
    }

    /// Intern multiple strings in batch with optimal performance.
    ///
    /// This is the fastest way to intern many strings at once.
    #[inline]
    pub fn intern_batch<'a, I>(&self, strings: I) -> Vec<AtomId>
    where
        I: IntoIterator<Item = &'a str>,
        I::IntoIter: ExactSizeIterator,
    {
        let iter = strings.into_iter();
        let mut result = Vec::with_capacity(iter.len());
        let g = self.str_to_atom_id.pin();

        for s in iter {
            result.push(self.intern_with_guard(s, &g));
        }

        result
    }

    #[cold]
    #[inline(never)]
    fn intern_cold(&self, s: &str, g: &StrToIdMapRef<'_, '_>) -> AtomId {
        let _span = tracy_span!("AtomTable::intern_cold");

        let key = EcoString::from(s);
        let key_ptr: *const EcoString = &key;

        *g.get_or_insert_with(key, || {
            let id = AtomId(self.next_atom_id.fetch_add(1, Ordering::Relaxed));

            // Safety: key_ptr is valid because key is still in scope
            unsafe {
                self.atom_id_to_str.insert(id, EcoString::clone(&*key_ptr));
            }

            id
        })
    }

    /// Pin a guard for batch operations.
    ///
    /// Use this with `intern_with_guard` when interning many strings:
    /// ```rust
    /// use atum::AtomTable;
    /// let tbl = AtomTable::new();
    /// let guard = tbl.pin();
    /// let strings: &[&str] = &[
    ///     "unfortunately", "there's", "a",
    ///     "radio", "connected", "to", "my", "brain"
    /// ];
    /// for s in strings {
    ///     tbl.intern_with_guard(s, &guard);
    /// }
    /// ```
    #[inline]
    pub fn pin(&self) -> StrToIdMapRef<'_, '_> {
        self.str_to_atom_id.pin()
    }

    /// Lookup by id and return an owned `String` (allocates).
    #[inline]
    #[must_use]
    pub fn lookup_owned(&self, id: AtomId) -> EcoString {
        EcoString::clone(&*self.atom_id_to_str.get(&id).unwrap_dbg())
    }

    /// Zero-allocation lookup returning a borrow that holds a read lock.
    #[inline]
    #[must_use]
    pub fn lookup_ref(&self, id: AtomId) -> dashmap::mapref::one::Ref<'_, AtomId, EcoString> {
        self.atom_id_to_str.get(&id).unwrap_dbg()
    }

    /// Check if a string is already interned without inserting it.
    ///
    /// Returns `Some(AtomId)` if the string exists, `None` otherwise.
    #[inline]
    #[must_use]
    pub fn try_lookup_id(&self, s: &str) -> Option<AtomId> {
        self.str_to_atom_id.pin().get(s).copied()
    }

    /// Check if a string is already interned using a pre-pinned guard.
    ///
    /// Faster than `try_lookup_id` when you already have a guard.
    #[inline]
    #[must_use]
    pub fn try_lookup_id_with_guard(&self, s: &str, g: &StrToIdMapRef<'_, '_>) -> Option<AtomId> {
        g.get(s).copied()
    }

    /// Returns `AtomId` if the string exists, panics otherwise.
    #[inline]
    #[must_use]
    pub fn lookup_id(&self, s: &str) -> AtomId {
        self.try_lookup_id(s).unwrap_dbg()
    }

    /// Returns `AtomId` if the string exists, panics otherwise.
    ///
    /// Faster than `lookup_id` when you already have a guard.
    #[inline]
    #[must_use]
    pub fn lookup_id_with_guard(&self, s: &str, g: &StrToIdMapRef<'_, '_>) -> AtomId {
        self.try_lookup_id_with_guard(s, g).unwrap_dbg()
    }

    /// Number of interned strings.
    #[inline]
    #[must_use]
    pub fn len(&self) -> usize {
        self.str_to_atom_id.len()
    }

    /// Is empty
    #[inline]
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.str_to_atom_id.is_empty()
    }

    /// Clear all interned strings (resets ids)
    #[inline]
    pub fn clear(&self) {
        self.str_to_atom_id.pin().clear();
        self.atom_id_to_str.clear();
        self.next_atom_id.store(0, Ordering::Relaxed);
    }

    /// Iterate over all interned strings.
    ///
    /// Returns an iterator of `(AtomId, EcoString)` pairs.
    /// The order is unspecified.
    #[inline]
    pub fn iter(&self) -> impl Iterator<Item = (AtomId, EcoString)> + '_ {
        self.atom_id_to_str
            .iter()
            .map(|r| (*r.key(), EcoString::clone(&*r)))
    }
}