#[cfg(not(feature = "spinlock"))]
use parking_lot::Mutex;
#[cfg(feature = "spinlock")]
use spin::Mutex;
use std::fmt;
use std::str::FromStr;
mod stringcache;
pub use stringcache::*;
#[cfg(feature = "serialization")]
pub mod serialization;
#[cfg(feature = "serialization")]
pub use serialization::DeserializedCache;
mod bumpalloc;
mod hash;
pub use hash::*;
use std::cmp::Ordering;
use std::hash::{Hash, Hasher};
use std::ptr::NonNull;
#[cfg_attr(
    feature = "spinlock",
    deprecated(
        since = "0.9.0",
        note = "spinlock was experimental and has now been deprecated for removal in 1.0, where parking_lot's Mutex will be the only synchronization primitive. Please do not use the 'spinlock' feature"
    )
)]
#[cfg_attr(
    feature = "fasthash",
    deprecated(
        since = "0.9.0",
        note = "fasthash support is deprecated and will be removed in 1.0 as ahash is better in all situations."
    )
)]
/// A `Copy` handle to a string stored in the global string cache.
///
/// The handle is a single non-null pointer (`#[repr(transparent)]` over
/// `NonNull<u8>`), so it is pointer-sized and `Option<Ustr>` needs no extra
/// space.
///
/// The derived `PartialEq` compares the stored pointer, not the string
/// contents.
///
/// When the `spinlock` or `fasthash` feature is enabled the type is flagged
/// as deprecated so that users of those features see the removal notice.
#[derive(Copy, Clone, PartialEq)]
#[repr(transparent)]
pub struct Ustr {
    /// Points at the first byte of the string data. The accessors in this
    /// file read the string length from the `usize` stored immediately before
    /// this address.
    char_ptr: NonNull<u8>,
}
/// Orders handles by the contents of the strings they refer to.
///
/// This compares string contents through `as_str`, whereas the derived
/// `PartialEq` on `Ustr` compares the stored pointer.
impl Ord for Ustr {
fn cmp(&self, other: &Self) -> Ordering {
self.as_str().cmp(other.as_str())
}
}
/// Partial ordering for `Ustr`; always `Some`, and always agrees with `Ord`.
impl PartialOrd for Ustr {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        // String comparison is total, so defer to the `Ord` implementation.
        Some(Ord::cmp(self, other))
    }
}
impl Ustr {
    /// Hashes `string` with the hasher selected at compile time.
    ///
    /// With the `hashcity` feature this is `fasthash`'s CityHash64; otherwise
    /// it is `ahash` seeded with fixed keys, so the same string always yields
    /// the same hash within a build.
    fn string_hash(string: &str) -> u64 {
        #[cfg(feature = "hashcity")]
        let hash = fasthash::city::hash64(string.as_bytes());
        #[cfg(not(feature = "hashcity"))]
        let hash = {
            let mut hasher = ahash::AHasher::new_with_keys(123, 456);
            hasher.write(string.as_bytes());
            hasher.finish()
        };
        hash
    }

    /// Interns `string` in the global cache and returns a handle to it.
    ///
    /// The string is hashed, the bin selected by `whichbin(hash)` is locked,
    /// and the pointer returned by that bin's `insert` becomes the handle.
    pub fn from(string: &str) -> Ustr {
        let hash = Self::string_hash(string);
        // The bin stays locked until the handle has been built.
        let mut bin = STRING_CACHE.0[whichbin(hash)].lock();
        let chars = bin.insert(string, hash) as *mut u8;
        Ustr {
            // NOTE(review): assumes `insert` never returns null — confirm in
            // `StringCache`.
            char_ptr: unsafe { NonNull::new_unchecked(chars) },
        }
    }

    /// Looks `string` up in the global cache without inserting it.
    ///
    /// Returns `None` when the bin's `get_existing` finds no entry.
    pub fn from_existing(string: &str) -> Option<Ustr> {
        let hash = Self::string_hash(string);
        let bin = STRING_CACHE.0[whichbin(hash)].lock();
        let chars = bin.get_existing(string, hash)?;
        Some(Ustr {
            // NOTE(review): assumes pointers handed out by the cache are
            // non-null — confirm in `StringCache`.
            char_ptr: unsafe { NonNull::new_unchecked(chars as *mut u8) },
        })
    }

    /// Returns the string this handle refers to.
    ///
    /// The length is read from the `usize` stored directly before the
    /// character data.
    pub fn as_str(&self) -> &'static str {
        let chars = self.char_ptr.as_ptr();
        unsafe {
            let len = std::ptr::read((chars as *const usize).sub(1));
            std::str::from_utf8_unchecked(std::slice::from_raw_parts(chars, len))
        }
    }

    /// Returns the address of the string data as a C `char` pointer.
    pub fn as_char_ptr(&self) -> *const std::os::raw::c_char {
        let chars: *const u8 = self.char_ptr.as_ptr();
        chars as *const std::os::raw::c_char
    }

    /// Views the string as a `CStr`.
    ///
    /// Reads `len() + 1` bytes, so this relies on the cache having stored a
    /// NUL byte directly after the string data (NOTE(review): confirm in
    /// `StringCache::insert`).
    pub fn as_cstr(&self) -> &std::ffi::CStr {
        let len_with_nul = self.len() + 1;
        unsafe {
            let bytes = std::slice::from_raw_parts(self.as_ptr(), len_with_nul);
            std::ffi::CStr::from_bytes_with_nul_unchecked(bytes)
        }
    }

    /// Returns the cache entry header stored in front of the string data.
    ///
    /// Steps back one `usize` (the length) and then one `u64` from the
    /// character pointer and reinterprets that address as a
    /// `StringCacheEntry`.
    fn as_string_cache_entry(&self) -> &StringCacheEntry {
        unsafe {
            let len_ptr = (self.char_ptr.as_ptr() as *const usize).sub(1);
            let entry_ptr = (len_ptr as *const u64).sub(1) as *const StringCacheEntry;
            entry_ptr.as_ref().unwrap()
        }
    }

    /// Returns the `len` field recorded in the cache entry.
    pub fn len(&self) -> usize {
        let entry = self.as_string_cache_entry();
        entry.len
    }

    /// Returns `true` when the recorded length is zero.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Returns the `hash` field recorded in the cache entry.
    pub fn precomputed_hash(&self) -> u64 {
        let entry = self.as_string_cache_entry();
        entry.hash
    }

    /// Copies the string into a newly allocated `String`.
    pub fn to_owned(&self) -> String {
        String::from(self.as_str())
    }
}
// `Ustr` wraps a `NonNull<u8>`, which is neither `Send` nor `Sync` by default,
// so both are asserted manually here.
// NOTE(review): soundness presumably relies on the cache never mutating or
// freeing string data while handles to it exist — confirm against
// `StringCache`; the `unsafe` `_clear_cache` may be the exception.
unsafe impl Send for Ustr {}
unsafe impl Sync for Ustr {}
/// Allows comparing a `Ustr` with a `&str` by string contents.
impl PartialEq<&str> for Ustr {
fn eq(&self, other: &&str) -> bool {
self.as_str() == *other
}
}
/// Allows comparing a `Ustr` with a `String` by string contents.
impl PartialEq<String> for Ustr {
fn eq(&self, other: &String) -> bool {
self.as_str() == other
}
}
// Marks the derived (pointer-based) `PartialEq` on `Ustr` as a full
// equivalence relation.
impl Eq for Ustr {}
/// Lets a `Ustr` be passed wherever an `AsRef<str>` is accepted.
impl AsRef<str> for Ustr {
fn as_ref(&self) -> &str {
self.as_str()
}
}
/// Enables `"text".parse::<Ustr>()`; parsing interns the string.
impl FromStr for Ustr {
// `from_str` below always returns `Ok`, so the error type is never produced.
type Err = std::string::ParseError;
#[inline]
fn from_str(s: &str) -> Result<Self, Self::Err> {
Ok(Ustr::from(s))
}
}
/// Converts a `&str` into a `Ustr` by interning it.
impl From<&str> for Ustr {
fn from(s: &str) -> Ustr {
// Resolves to the inherent `Ustr::from`, not to this trait method.
Ustr::from(s)
}
}
/// Converts a `Ustr` into the `&'static str` returned by `as_str`.
impl From<Ustr> for &'static str {
fn from(s: Ustr) -> &'static str {
s.as_str()
}
}
/// Converts an owned `String` into a `Ustr` by interning its contents.
impl From<String> for Ustr {
fn from(s: String) -> Ustr {
// `&s` coerces to `&str` for the inherent `Ustr::from`.
Ustr::from(&s)
}
}
/// The default `Ustr` is the interned empty string.
impl Default for Ustr {
fn default() -> Self {
Ustr::from("")
}
}
/// Dereferences to `str`, so `str` methods can be called directly on a `Ustr`.
impl std::ops::Deref for Ustr {
type Target = str;
fn deref(&self) -> &Self::Target {
self.as_str()
}
}
/// Displays the interned string exactly as `str` would.
impl fmt::Display for Ustr {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Delegate to `str`'s own `Display` so that width, alignment, fill and
        // precision requested by the caller (e.g. `{:>10}`) are honoured;
        // `write!(f, "{}", ..)` would start a fresh format spec and drop them.
        fmt::Display::fmt(self.as_str(), f)
    }
}
/// Debug-formats the handle as `u!("<contents>")`.
///
/// The contents are written with `{}`, so characters such as `"` are not
/// escaped.
impl fmt::Debug for Ustr {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "u!(\"{}\")", self.as_str())
}
}
// `PartialEq` is derived while `Hash` is written by hand, which trips this
// clippy lint; the manual impl is deliberate.
#[allow(clippy::derive_hash_xor_eq)]
/// Hashes a `Ustr` by feeding the hash stored in its cache entry to `state`,
/// instead of re-hashing the string bytes.
impl Hash for Ustr {
fn hash<H: Hasher>(&self, state: &mut H) {
self.precomputed_hash().hash(state);
}
}
/// Calls `clear()` on every bin of the global string cache, locking each bin
/// in turn.
///
/// # Safety
///
/// NOTE(review): presumably any `Ustr` created before this call must not be
/// used afterwards, since its backing entry may be gone — confirm against
/// `StringCache::clear`.
#[doc(hidden)]
pub unsafe fn _clear_cache() {
    STRING_CACHE.0.iter().for_each(|bin| {
        bin.lock().clear();
    });
}
/// Returns the sum of `total_allocated()` over every bin of the global cache.
///
/// Bins are locked one at a time, so the result is not an atomic snapshot if
/// other threads are inserting concurrently.
pub fn total_allocated() -> usize {
    let bins = &STRING_CACHE.0;
    bins.iter().map(|bin| bin.lock().total_allocated()).sum()
}
/// Returns the sum of `total_capacity()` over every bin of the global cache.
///
/// Bins are locked one at a time, so the result is not an atomic snapshot if
/// other threads are inserting concurrently.
pub fn total_capacity() -> usize {
    let bins = &STRING_CACHE.0;
    bins.iter().map(|bin| bin.lock().total_capacity()).sum()
}
/// Interns `s` in the global cache and returns its handle.
///
/// Free-function shorthand for `Ustr::from`.
#[inline]
pub fn ustr(s: &str) -> Ustr {
Ustr::from(s)
}
/// Returns the handle for `s` if `Ustr::from_existing` finds it in the cache,
/// or `None` otherwise.
///
/// Free-function shorthand for `Ustr::from_existing`.
#[inline]
pub fn existing_ustr(s: &str) -> Option<Ustr> {
Ustr::from_existing(s)
}
/// Returns a reference to the global cache bins.
///
/// The tests in this file pass the result to `serde_json::to_string` when
/// the `serialization` feature is enabled.
pub fn get_cache() -> &'static Bins {
// Deref through the lazy-static wrapper to reach the `Bins` value itself.
&*STRING_CACHE
}
/// Returns the sum of `num_entries()` over every bin of the global cache.
///
/// Bins are locked one at a time, so the result is not an atomic snapshot if
/// other threads are inserting concurrently.
pub fn num_entries() -> usize {
    let bins = &STRING_CACHE.0;
    bins.iter().map(|bin| bin.lock().num_entries()).sum()
}
/// Returns `num_entries()` for each bin, in bin order.
///
/// Bins are locked one at a time while the vector is built.
#[doc(hidden)]
pub fn num_entries_per_bin() -> Vec<usize> {
    let bins = &STRING_CACHE.0;
    let mut counts = Vec::with_capacity(bins.len());
    for bin in bins.iter() {
        counts.push(bin.lock().num_entries());
    }
    counts
}
/// Builds an iterator over the contents of the global cache.
///
/// For every bin (locked in turn) this records the `(ptr(), end())` range of
/// each entry in `old_allocs`, followed by the range of the bin's current
/// `alloc` unless that range is empty. Iteration starts at the beginning of
/// the first recorded range.
///
/// # Panics
///
/// Panics when no range was recorded at all, i.e. when no bin has old
/// allocations and every current allocation is empty (`ptr() == end()`),
/// because the first range is indexed unconditionally.
pub fn string_cache_iter() -> StringCacheIterator {
    let mut allocs = Vec::new();
    for bin in STRING_CACHE.0.iter() {
        let cache = bin.lock();
        allocs.extend(cache.old_allocs.iter().map(|old| (old.ptr(), old.end())));
        let (start, end) = (cache.alloc.ptr(), cache.alloc.end());
        if start != end {
            allocs.push((start, end));
        }
    }
    // Indexing panics on an empty cache; see `# Panics` above.
    let (current_ptr, _) = allocs[0];
    StringCacheIterator {
        allocs,
        current_alloc: 0,
        current_ptr,
    }
}
/// The global cache storage: `NUM_BINS` independently locked `StringCache`s.
///
/// `#[repr(transparent)]` gives this wrapper the same layout as the inner
/// array; the `STRING_CACHE` initializer relies on that when it transmutes
/// the array into a `Bins`.
#[repr(transparent)]
pub struct Bins(pub(crate) [Mutex<StringCache>; NUM_BINS]);
#[cfg(test)]
mod tests {
    use lazy_static::lazy_static;
    use std::sync::Mutex;

    lazy_static! {
        // Every test takes this lock first. The cache is process-global and
        // several tests clear it or assert on its entry count, so tests must
        // not overlap.
        static ref TEST_LOCK: Mutex<()> = Mutex::new(());
    }

    /// A `Ustr` compares equal to both `&str` and `String` with the same text.
    #[test]
    fn it_works() {
        let _t = TEST_LOCK.lock();
        use super::ustr as u;
        let u_hello = u("hello");
        assert_eq!(u_hello, "hello");
        let u_world = u("world");
        assert_eq!(u_world, String::from("world"));
    }

    /// Interning the empty string twice yields a single cache entry.
    #[test]
    fn empty_string() {
        let _t = TEST_LOCK.lock();
        use super::ustr as u;
        unsafe {
            super::_clear_cache();
        }
        let _empty = u("");
        let empty = u("");
        assert!(empty.as_str().is_empty());
        assert_eq!(super::num_entries(), 1);
    }

    /// `as_char_ptr` round-trips through `CStr`, for ASCII and multi-byte
    /// UTF-8 text alike.
    #[test]
    fn c_str_works() {
        let _t = TEST_LOCK.lock();
        use super::ustr as u;
        use std::ffi::CStr;
        let s_fox = "The quick brown fox jumps over the lazy dog.";
        let u_fox = u(s_fox);
        let fox = unsafe { CStr::from_ptr(u_fox.as_char_ptr()) }
            .to_string_lossy()
            .into_owned();
        assert_eq!(fox, s_fox);
        let s_odys = "Τη γλώσσα μου έδωσαν ελληνική";
        let u_odys = u(s_odys);
        let odys = unsafe { CStr::from_ptr(u_odys.as_char_ptr()) }
            .to_string_lossy()
            .into_owned();
        assert_eq!(odys, s_odys);
    }

    /// Interns the "big list of naughty strings" many times over and checks
    /// that the cache iterator yields exactly the distinct inputs.
    #[test]
    #[cfg_attr(miri, ignore)]
    fn blns() {
        let _t = TEST_LOCK.lock();
        use super::{string_cache_iter, ustr as u};
        use std::collections::HashSet;
        unsafe { super::_clear_cache() };
        let blns = include_str!("../data/blns.txt");
        let mut hs = HashSet::new();
        for s in blns.split_whitespace() {
            hs.insert(s);
        }
        let mut us = Vec::new();
        let mut ss = Vec::new();
        for s in blns.split_whitespace().cycle().take(100_000) {
            let u = u(s);
            us.push(u);
            ss.push(s.to_owned());
        }
        let mut hs_u = HashSet::new();
        for s in string_cache_iter() {
            hs_u.insert(s);
        }
        let diff: HashSet<_> = hs.difference(&hs_u).collect();
        assert_eq!(super::num_entries(), hs.len());
        assert_eq!(diff.len(), 0);
        let nbs = super::num_entries_per_bin();
        println!("{:?}", nbs);
        println!("Total allocated: {}", super::total_allocated());
        println!("Total capacity: {}", super::total_capacity());
        println!(
            "size of StringCache: {}",
            std::mem::size_of::<super::StringCache>()
        );
    }

    /// Stress test: repeatedly clears the cache and re-interns a large set of
    /// path-like strings.
    #[test]
    #[cfg_attr(miri, ignore)]
    fn raft() {
        let _t = TEST_LOCK.lock();
        use super::ustr as u;
        use std::sync::Arc;
        let raft = include_str!("../data/raft-large-directories.txt");
        let raft = Arc::new(
            raft.split_whitespace()
                .collect::<Vec<_>>()
                .chunks(3)
                .map(|s| {
                    if s.len() == 3 {
                        format!("{}/{}/{}", s[0], s[1], s[2])
                    } else {
                        s[0].to_owned()
                    }
                })
                .collect::<Vec<_>>(),
        );
        let s = raft.clone();
        for _ in 0..600 {
            let mut v = Vec::with_capacity(20_000);
            unsafe { super::_clear_cache() };
            for s in s.iter().cycle().take(20_000) {
                v.push(u(s));
            }
        }
    }

    /// The whole cache survives a JSON round trip.
    #[cfg(all(feature = "serialization", not(miri)))]
    #[test]
    fn serialization() {
        let _t = TEST_LOCK.lock();
        use super::{string_cache_iter, ustr as u};
        use std::collections::HashSet;
        unsafe { super::_clear_cache() };
        let path = std::path::Path::new(
            &std::env::var("CARGO_MANIFEST_DIR")
                .expect("CARGO_MANIFEST_DIR not set"),
        )
        .join("data")
        .join("blns.txt");
        let blns = std::fs::read_to_string(path).unwrap();
        let mut hs = HashSet::new();
        for s in blns.split_whitespace() {
            hs.insert(s);
        }
        let mut us = Vec::new();
        let mut ss = Vec::new();
        for s in blns.split_whitespace().cycle().take(100_000) {
            let u = u(s);
            us.push(u);
            ss.push(s.to_owned());
        }
        let json = serde_json::to_string(super::get_cache()).unwrap();
        unsafe {
            super::_clear_cache();
        }
        let _: super::DeserializedCache = serde_json::from_str(&json).unwrap();
        let mut hs_u = HashSet::new();
        for s in string_cache_iter() {
            hs_u.insert(s);
        }
        let diff: HashSet<_> = hs.difference(&hs_u).collect();
        assert_eq!(super::num_entries(), hs.len());
        assert_eq!(diff.len(), 0);
    }

    /// A single `Ustr` survives a JSON round trip.
    #[cfg(all(feature = "serialization", not(miri)))]
    #[test]
    fn serialization_ustr() {
        // Hold the lock like every other test: without it this test can run
        // while another test clears the cache or counts its entries.
        let _t = TEST_LOCK.lock();
        use super::{ustr, Ustr};
        let u_hello = ustr("hello");
        let json = serde_json::to_string(&u_hello).unwrap();
        let me_hello: Ustr = serde_json::from_str(&json).unwrap();
        assert_eq!(u_hello, me_hello);
    }

    /// `<` follows the order of the underlying strings.
    #[test]
    fn partial_ord() {
        let _t = TEST_LOCK.lock();
        use super::ustr;
        let str_a = ustr("aaa");
        let str_z = ustr("zzz");
        let str_k = ustr("kkk");
        assert!(str_a < str_k);
        assert!(str_k < str_z);
    }

    /// Sorting a vector of handles orders them by string contents.
    #[test]
    fn ord() {
        let _t = TEST_LOCK.lock();
        use super::ustr;
        let u_apple = ustr("apple");
        let u_bravo = ustr("bravo");
        let u_charlie = ustr("charlie");
        let u_delta = ustr("delta");
        let mut v = vec![u_delta, u_bravo, u_charlie, u_apple];
        v.sort();
        assert_eq!(v, vec![u_apple, u_bravo, u_charlie, u_delta]);
    }

    // Helper that only compiles if `S` converts into a `&str`.
    fn takes_into_str<'a, S: Into<&'a str>>(s: S) -> &'a str {
        s.into()
    }

    /// A `Ustr` can be passed where `Into<&str>` is expected.
    #[test]
    fn test_into_str() {
        let _t = TEST_LOCK.lock();
        use super::ustr;
        assert_eq!("converted", takes_into_str(ustr("converted")));
    }

    /// `existing_ustr` returns `None` before a string is interned and the
    /// same handle afterwards.
    #[test]
    fn test_existing_ustr() {
        let _t = TEST_LOCK.lock();
        use super::{existing_ustr, ustr};
        assert_eq!(existing_ustr("hello world!"), None);
        let s1 = ustr("hello world!");
        let s2 = existing_ustr("hello world!");
        assert_eq!(Some(s1), s2);
    }
}
lazy_static::lazy_static! {
// The process-wide string cache: `NUM_BINS` mutex-guarded `StringCache`s,
// built the first time the static is accessed.
static ref STRING_CACHE: Bins = {
use std::mem::{self, MaybeUninit};
// SAFETY: an array of `MaybeUninit<T>` needs no initialization itself, so
// `assume_init` on the outer `MaybeUninit` is sound.
let mut bins: [MaybeUninit<Mutex<StringCache>>; NUM_BINS] = unsafe {
MaybeUninit::uninit().assume_init()
};
// Fill every slot with a default cache. Assigning over a `MaybeUninit`
// does not drop the previous (uninitialized) contents.
for bin in &mut bins[..] {
*bin = MaybeUninit::new(Mutex::new(StringCache::default()));
}
// SAFETY: every element was initialized by the loop above, and `Bins` is a
// `#[repr(transparent)]` wrapper around `[Mutex<StringCache>; NUM_BINS]`.
unsafe { mem::transmute::<_, Bins>(bins) }
};
}
/// Maps a string hash to the index of the bin that stores it.
///
/// The hash is shifted right by `TOP_SHIFT` and the remaining high bits are
/// reduced modulo `NUM_BINS`.
#[inline]
fn whichbin(hash: u64) -> usize {
    let shift = TOP_SHIFT as u64;
    let bin_count = NUM_BINS as u64;
    let top_bits = hash >> shift;
    (top_bits % bin_count) as usize
}