#![cfg_attr(all(feature = "bench", test), feature(test))]
#![doc(html_root_url = "https://docs.rs/crate/arc-string-interner/0.1.0")]
#![deny(missing_docs)]
#[cfg(all(feature = "bench", test))]
extern crate test;
#[cfg(test)]
mod tests;
#[cfg(all(feature = "bench", test))]
mod benches;
use array_init::array_init;
use parking_lot::{RwLock, RwLockUpgradableReadGuard, RwLockWriteGuard};
use std::{
collections::{hash_map::RandomState, HashMap},
hash::{BuildHasher, Hash, Hasher},
num::NonZeroU32,
sync::Arc,
};
/// A cheap, copyable handle for an interned string.
///
/// Implementors map losslessly to and from `usize`, so an interner can pack
/// bookkeeping (here: shard index and in-shard index) into the value.
pub trait Symbol: Copy + Ord + Eq {
    /// Creates a symbol from the given `usize` value.
    ///
    /// Implementations may panic if `val` is out of their representable range
    /// (see [`Sym`]).
    fn from_usize(val: usize) -> Self;
    /// Returns the `usize` value this symbol was created from.
    fn to_usize(self) -> usize;
}
/// The default symbol type, backed by a `NonZeroU32`.
///
/// Thanks to the `NonZeroU32` niche, `Option<Sym>` is the same size as `Sym`.
/// The stored value is the symbol index plus one (see the `Symbol` impl).
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Sym(NonZeroU32);
impl Symbol for Sym {
    /// Creates a `Sym` from `val`, storing `val + 1` internally so the
    /// `NonZeroU32` niche stays available.
    ///
    /// # Panics
    ///
    /// Panics if `val >= u32::MAX as usize`, since `val + 1` must fit in a
    /// `u32`.
    fn from_usize(val: usize) -> Self {
        assert!(
            val < u32::MAX as usize,
            // Fixed: the message previously named `string_interner::Sym`, a
            // leftover from the upstream crate this code derives from.
            "Symbol value {} is too large and not supported by `arc_string_interner::Sym` type",
            val
        );
        Sym(NonZeroU32::new((val + 1) as u32).unwrap_or_else(|| {
            unreachable!("Should never fail because `val + 1` is nonzero and `<= u32::MAX`")
        }))
    }

    /// Returns the original index (the stored value minus one).
    fn to_usize(self) -> usize {
        (self.0.get() as usize) - 1
    }
}
impl Symbol for usize {
    /// A `usize` is its own symbol value; this is the identity conversion.
    fn from_usize(index: usize) -> Self {
        index
    }

    /// Identity conversion back to `usize`.
    fn to_usize(self) -> usize {
        self
    }
}
// Map key that views an interned string by raw pointer instead of owning it.
//
// The `*const str` points into an `Arc<str>` stored in the same shard's `Vec`
// (see `intern_into_shard`), so the pointee lives — at a stable heap address —
// for as long as the map entry does. The raw pointer suppresses auto
// `Send`/`Sync`; thread-safety is asserted manually on `StringInterner`.
#[derive(Debug, Copy, Clone, Eq)]
struct InternalStrRef(*const str);
impl InternalStrRef {
    // Creates a raw reference from a borrowed `str`. The caller must keep the
    // pointee alive (and at a fixed address) for as long as this key is used.
    fn from_str(val: &str) -> Self {
        InternalStrRef(val as *const str)
    }

    // Dereferences the raw pointer back to `&str`.
    //
    // SAFETY: relies on the invariant that the pointer targets a live `str`.
    // Within this crate it points into an `Arc<str>` held by the same shard's
    // `Vec`, from which entries are never removed, so the pointee outlives
    // this key. Not checked here — callers uphold the invariant.
    fn as_str(&self) -> &str {
        unsafe { &*self.0 }
    }
}
impl<T> From<T> for InternalStrRef
where
T: AsRef<str>,
{
fn from(val: T) -> Self {
InternalStrRef::from_str(val.as_ref())
}
}
impl Hash for InternalStrRef {
    // Hash by string contents — not by pointer — so hashing agrees with the
    // content-based `PartialEq` impl and with lookups keyed by `&str`.
    fn hash<H: Hasher>(&self, state: &mut H) {
        let contents = self.as_str();
        Hash::hash(contents, state)
    }
}
impl PartialEq for InternalStrRef {
    // Compare by string contents, consistent with the `Hash` impl.
    fn eq(&self, other: &InternalStrRef) -> bool {
        let (lhs, rhs) = (self.as_str(), other.as_str());
        lhs.eq(rhs)
    }
}
/// A [`StringInterner`] with the default symbol type ([`Sym`]), default
/// hasher (`RandomState`), and 8 shards.
pub type DefaultStringInterner = StringInterner<Sym, RandomState, 8>;
// One shard: a lock guarding the lookup map (string contents -> symbol)
// together with the backing storage (in-shard index -> Arc<str>) that the
// map's raw-pointer keys point into.
type Shard<S, H> = RwLock<(HashMap<InternalStrRef, S, H>, Vec<Arc<str>>)>;
/// A concurrent string interner sharded over `N` reader-writer locks.
///
/// Each shard pairs a `HashMap` (string contents -> symbol) with a `Vec` of
/// the owned `Arc<str>` values; a symbol encodes both the shard index and the
/// position within that shard's `Vec`.
#[derive(Debug)]
pub struct StringInterner<S, H, const N: usize>
where
    S: Symbol,
    H: BuildHasher,
{
    // The shards; which shard a string belongs to is derived from
    // `hash_builder`'s hash of it.
    map_values: [Shard<S, H>; N],
    // Hasher used only for shard selection; each shard's map carries its own
    // `BuildHasher` instance.
    hash_builder: H,
}
impl Default for StringInterner<Sym, RandomState, 8> {
#[inline]
fn default() -> Self {
StringInterner::<Sym, RandomState, 8>::new()
}
}
// SAFETY: the raw pointers inside `InternalStrRef` suppress the automatic
// `Send`/`Sync` impls. Every such pointer targets an `Arc<str>` kept alive in
// the same shard's `Vec`, and both map and vec sit behind the shard's
// `RwLock`, so cross-thread access to them is synchronized.
//
// The original impls placed no bound on `H`, even though the struct stores an
// `H` directly and one inside every shard map — that is unsound for
// non-thread-safe hashers, so `Send`/`Sync` bounds on `H` are added here.
unsafe impl<S, H, const N: usize> Send for StringInterner<S, H, N>
where
    S: Symbol + Send,
    H: BuildHasher + Send,
{
}
// SAFETY: see the `Send` impl above. `&self` methods hash through
// `hash_builder` concurrently, so sharing requires `H: Sync`; the shard
// contents may also be mutated from any thread that reaches a write lock,
// hence `H: Send` as well.
unsafe impl<S, H, const N: usize> Sync for StringInterner<S, H, N>
where
    S: Symbol + Sync,
    H: BuildHasher + Send + Sync,
{
}
impl<S, const N: usize> StringInterner<S, RandomState, N>
where
    S: Symbol,
{
    /// Creates an empty interner using randomly seeded `RandomState` hashers.
    #[inline]
    pub fn new() -> Self {
        StringInterner {
            map_values: array_init(|_| RwLock::new((HashMap::new(), Vec::new()))),
            hash_builder: RandomState::new(),
        }
    }

    /// Creates an empty interner with room for about `cap` strings in total.
    ///
    /// The capacity is spread across the `N` shards: each shard's map and
    /// vector are pre-sized for `cap / N` entries. (Previously each shard's
    /// `Vec` was sized to the full `cap`, over-allocating by a factor of `N`,
    /// while the maps already used `cap / N`.)
    #[inline]
    pub fn with_capacity(cap: usize) -> Self {
        let per_shard = cap / N;
        StringInterner {
            map_values: array_init(|_| {
                RwLock::new((
                    HashMap::with_capacity(per_shard),
                    Vec::with_capacity(per_shard),
                ))
            }),
            hash_builder: RandomState::new(),
        }
    }
}
// Write guard over one shard's `(map, values)` pair.
// NOTE(review): renamed from `ShardReadGuard` — the guarded type is a
// `RwLockWriteGuard`, so the old name was misleading.
type ShardWriteGuard<'a, S, H> =
    RwLockWriteGuard<'a, (HashMap<InternalStrRef, S, H>, Vec<Arc<str>>)>;

impl<S, H, const N: usize> StringInterner<S, H, N>
where
    S: Symbol,
    H: BuildHasher + Clone,
{
    // Number of low bits of a symbol that hold the index within a shard.
    const INDEX_BITS: usize = 24;
    // Mask selecting those low bits (0x00FF_FFFF).
    const INDEX_MASK: usize = (1 << Self::INDEX_BITS) - 1;

    /// Extracts the element index (position in the shard's storage) from `sym`.
    pub fn sym_to_index(sym: S) -> usize {
        Symbol::to_usize(sym) & Self::INDEX_MASK
    }

    /// Extracts the shard index from `sym`.
    pub fn sym_to_shard_index(sym: S) -> usize {
        Symbol::to_usize(sym) >> Self::INDEX_BITS
    }

    /// Creates an empty interner whose shard maps all use clones of
    /// `hash_builder`.
    #[inline]
    pub fn with_hasher(hash_builder: H) -> Self {
        StringInterner {
            map_values: array_init(|_| {
                RwLock::new((HashMap::with_hasher(hash_builder.clone()), Vec::new()))
            }),
            hash_builder,
        }
    }

    /// Creates an empty interner with room for about `cap` strings in total,
    /// spread evenly across the `N` shards.
    ///
    /// (Previously each shard's `Vec` was sized to the full `cap`,
    /// over-allocating by a factor of `N`, while the maps used `cap / N`.)
    #[inline]
    pub fn with_capacity_and_hasher(cap: usize, hash_builder: H) -> Self {
        let per_shard = cap / N;
        StringInterner {
            map_values: array_init(|_| {
                RwLock::new((
                    HashMap::with_capacity_and_hasher(per_shard, hash_builder.clone()),
                    Vec::with_capacity(per_shard),
                ))
            }),
            hash_builder,
        }
    }

    // Hashes `s` with the interner-level hasher; used only for shard
    // selection, so it must be applied consistently (and it is — both
    // `get_or_intern` and `get` route through here).
    fn hash(&self, s: &str) -> u64 {
        let mut hasher = self.hash_builder.build_hasher();
        hasher.write(s.as_bytes());
        hasher.finish()
    }

    // Maps a hash onto a shard index in `0..N`.
    fn shard_index(hash: u64) -> u64 {
        ((hash as usize) % N) as u64
    }

    /// Returns the symbol for `val`, interning it first if it is new.
    ///
    /// Takes an upgradable read lock on the target shard, so the lookup can
    /// run concurrently with plain readers; only a miss upgrades to a write
    /// lock. The upgradable lock excludes other writers between the lookup
    /// and the upgrade, so no duplicate entry can be inserted in between.
    #[inline]
    pub fn get_or_intern<T>(&self, val: T) -> S
    where
        T: Into<String> + AsRef<str>,
    {
        let shard_index = Self::shard_index(self.hash(val.as_ref()));
        let map_values = self.map_values[shard_index as usize].upgradable_read();
        match map_values.0.get(&val.as_ref().into()) {
            Some(e) => *e,
            None => self.intern_into_shard(
                shard_index,
                RwLockUpgradableReadGuard::upgrade(map_values),
                val,
            ),
        }
    }

    // Inserts `new_val` into the write-locked shard and returns its fresh
    // symbol. The map key's raw pointer targets the `Arc<str>`'s heap
    // allocation, so it stays valid even when the shard's `Vec` reallocates.
    fn intern_into_shard<T>(
        &self,
        shard_index: u64,
        mut map_values_shard: ShardWriteGuard<S, H>,
        new_val: T,
    ) -> S
    where
        T: Into<String> + AsRef<str>,
    {
        let new_boxed_val: Arc<str> = new_val.into().into();
        let new_ref: InternalStrRef = new_boxed_val.as_ref().into();
        let new_id: S = Self::make_symbol(shard_index, map_values_shard.1.len());
        map_values_shard.1.push(new_boxed_val);
        map_values_shard.0.insert(new_ref, new_id);
        new_id
    }

    // Packs `(shard_index, element_index)` into a single symbol value.
    //
    // The assert prevents silent corruption: an element index needing more
    // than `INDEX_BITS` bits would bleed into the shard bits and make
    // distinct strings resolve to the same symbol.
    fn make_symbol(shard_index: u64, element_index: usize) -> S {
        assert!(
            element_index <= Self::INDEX_MASK,
            "shard overflow: too many strings interned into a single shard"
        );
        S::from_usize(((shard_index as usize) << Self::INDEX_BITS) | element_index)
    }

    /// Resolves `symbol` to its interned string, or `None` if this interner
    /// never issued it.
    ///
    /// The returned `Arc<str>` is an O(1) clone of the stored value.
    #[inline]
    pub fn resolve(&self, symbol: S) -> Option<Arc<str>> {
        self.map_values[Self::sym_to_shard_index(symbol)]
            .read()
            .1
            .get(Self::sym_to_index(symbol))
            .cloned()
    }

    /// Resolves `symbol` to its interned string without bounds checking.
    ///
    /// # Safety
    ///
    /// `symbol` must have been returned by [`Self::get_or_intern`] on this
    /// interner; otherwise the unchecked element lookup is undefined behavior
    /// (and the shard-array access may panic).
    #[inline]
    pub unsafe fn resolve_unchecked(&self, symbol: S) -> Arc<str> {
        self.map_values[Self::sym_to_shard_index(symbol)]
            .read()
            .1
            .get_unchecked(Self::sym_to_index(symbol))
            .clone()
    }

    /// Returns the symbol for `val` if it is already interned, without
    /// interning it on a miss.
    #[inline]
    pub fn get<T>(&self, val: T) -> Option<S>
    where
        T: AsRef<str>,
    {
        let shard_index = Self::shard_index(self.hash(val.as_ref()));
        self.map_values[shard_index as usize]
            .read()
            .0
            .get(&val.as_ref().into())
            .cloned()
    }

    /// Returns the total number of interned strings.
    ///
    /// All shard read locks are acquired (in array order) before any count is
    /// taken, so the result is a consistent snapshot across shards.
    #[inline]
    pub fn len(&self) -> usize {
        let guards: Vec<_> = self.map_values.iter().map(RwLock::read).collect();
        guards.iter().map(|shard| shard.1.len()).sum()
    }

    /// Returns `true` if no strings have been interned.
    ///
    /// Like `len`, this locks every shard before inspecting any of them,
    /// giving a consistent snapshot.
    #[inline]
    pub fn is_empty(&self) -> bool {
        let guards: Vec<_> = self.map_values.iter().map(RwLock::read).collect();
        guards.iter().all(|shard| shard.1.is_empty())
    }

    /// Shrinks every shard's map and vector to fit its current contents.
    ///
    /// Takes `&mut self`, so the write locks are uncontended in practice.
    pub fn shrink_to_fit(&mut self) {
        for shard in self.map_values.iter() {
            let mut shard = shard.write();
            shard.0.shrink_to_fit();
            shard.1.shrink_to_fit();
        }
    }
}