use parking_lot::Mutex;
use std::{
borrow::Cow,
cmp::Ordering,
ffi::{CStr, OsStr},
fmt,
hash::{Hash, Hasher},
ops::Deref,
os::raw::c_char,
path::Path,
ptr::NonNull,
rc::Rc,
slice, str,
str::FromStr,
sync::Arc,
};
mod hash;
pub use hash::*;
mod bumpalloc;
mod stringcache;
pub use stringcache::*;
#[cfg(feature = "serde")]
pub mod serialization;
#[cfg(feature = "serde")]
pub use serialization::DeserializedCache;
/// An interned string handle.
///
/// The whole value is a single non-null pointer (`#[repr(transparent)]`), so
/// copying a `Ustr` copies only that pointer. The derived `PartialEq` compares
/// the stored pointers, not the string bytes.
#[derive(Copy, Clone, PartialEq)]
#[repr(transparent)]
pub struct Ustr {
// Address of the first byte of the string data. The methods on `Ustr` read a
// `StringCacheEntry` header located immediately before this address.
char_ptr: NonNull<u8>,
}
/// Total order on the string contents (not on the pointer values).
impl Ord for Ustr {
    fn cmp(&self, rhs: &Self) -> Ordering {
        let (left, right) = (self.as_str(), rhs.as_str());
        Ord::cmp(left, right)
    }
}
/// Partial order that always defers to the `Ord` implementation.
#[allow(clippy::non_canonical_partial_ord_impl)]
impl PartialOrd for Ustr {
    fn partial_cmp(&self, rhs: &Self) -> Option<Ordering> {
        let ordering = Ord::cmp(self, rhs);
        Some(ordering)
    }
}
impl Ustr {
    /// Hashes the bytes of `string` with a default `AHasher`. The result is
    /// used both to pick a cache bin and as the hash handed to that bin.
    #[inline]
    fn hash_str(string: &str) -> u64 {
        let mut hasher = ahash::AHasher::default();
        hasher.write(string.as_bytes());
        hasher.finish()
    }

    /// Interns `string` in the global cache and returns its handle.
    ///
    /// The bin chosen by `whichbin` stays locked for the duration of the
    /// `insert` call.
    pub fn from(string: &str) -> Ustr {
        let hash = Self::hash_str(string);
        let mut bin = STRING_CACHE.0[whichbin(hash)].lock();
        let entry = bin.insert(string, hash) as *mut u8;
        // SAFETY: relies on `insert` never returning null — not visible from
        // this file; TODO confirm in `stringcache`.
        let char_ptr = unsafe { NonNull::new_unchecked(entry) };
        Ustr { char_ptr }
    }

    /// Looks `string` up in the global cache without inserting it.
    ///
    /// Returns `None` when the bin's `get_existing` finds nothing.
    pub fn from_existing(string: &str) -> Option<Ustr> {
        let hash = Self::hash_str(string);
        let bin = STRING_CACHE.0[whichbin(hash)].lock();
        let entry = bin.get_existing(string, hash)?;
        // SAFETY: relies on `get_existing` only yielding non-null pointers —
        // TODO confirm in `stringcache`.
        let char_ptr = unsafe { NonNull::new_unchecked(entry as *mut u8) };
        Some(Ustr { char_ptr })
    }

    /// Returns the interned text as a `&'static str`.
    pub fn as_str(&self) -> &'static str {
        // SAFETY: assumes `char_ptr` addresses `len()` bytes of valid UTF-8
        // that stay alive and unmodified — an invariant of the cache, not
        // checked here.
        let bytes: &'static [u8] =
            unsafe { slice::from_raw_parts(self.char_ptr.as_ptr(), self.len()) };
        unsafe { str::from_utf8_unchecked(bytes) }
    }

    /// Returns the string data pointer typed as a C `char` pointer.
    pub fn as_char_ptr(&self) -> *const c_char {
        let data: *mut u8 = self.char_ptr.as_ptr();
        data as *const c_char
    }

    /// Views the string plus the single byte following it as a `CStr`.
    pub fn as_cstr(&self) -> &CStr {
        let len_with_nul = self.len() + 1;
        // SAFETY: relies on the cache storing a NUL byte directly after the
        // string data, with no interior NULs — TODO confirm in `stringcache`.
        unsafe {
            let bytes = slice::from_raw_parts(self.as_ptr(), len_with_nul);
            CStr::from_bytes_with_nul_unchecked(bytes)
        }
    }

    /// Returns the `StringCacheEntry` header that sits one entry-width before
    /// the string data.
    #[inline]
    fn as_string_cache_entry(&self) -> &StringCacheEntry {
        let header = self.char_ptr.as_ptr().cast::<StringCacheEntry>();
        // SAFETY: assumes every `char_ptr` was produced by the cache with a
        // properly aligned header immediately in front of it.
        unsafe { &*header.sub(1) }
    }

    /// Length of the string in bytes, read from the cache entry header.
    #[inline]
    pub fn len(&self) -> usize {
        let entry = self.as_string_cache_entry();
        entry.len
    }

    /// `true` when the string has zero length.
    pub fn is_empty(&self) -> bool {
        0 == self.len()
    }

    /// Returns the hash stored in this string's cache entry header.
    #[inline]
    pub fn precomputed_hash(&self) -> u64 {
        let entry = self.as_string_cache_entry();
        entry.hash
    }

    /// Copies the text into a freshly allocated `String`.
    pub fn to_owned(&self) -> String {
        String::from(self.as_str())
    }
}
// SAFETY (review note): `Ustr` holds only a `NonNull<u8>`, and the methods in
// `impl Ustr` only ever read through it. Soundness therefore rests on the cache
// keeping the pointed-to entry alive and unmodified; presumably only the
// `unsafe fn _clear_cache` can violate that — TODO confirm in `stringcache`.
unsafe impl Send for Ustr {}
unsafe impl Sync for Ustr {}
/// Content comparison between an interned string and a plain `str`.
impl PartialEq<str> for Ustr {
    fn eq(&self, rhs: &str) -> bool {
        let lhs: &str = self.as_str();
        lhs == rhs
    }
}
/// Content comparison between a plain `str` and an interned string.
impl PartialEq<Ustr> for str {
    fn eq(&self, rhs: &Ustr) -> bool {
        let interned: &str = rhs.as_str();
        self == interned
    }
}
/// Lets a `Ustr` be compared directly against a `&str` value.
impl PartialEq<&str> for Ustr {
    fn eq(&self, rhs: &&str) -> bool {
        let plain: &str = rhs;
        self.as_str() == plain
    }
}
/// Lets a `&str` value be compared directly against a `Ustr`.
impl PartialEq<Ustr> for &str {
    fn eq(&self, rhs: &Ustr) -> bool {
        let plain: &str = self;
        plain == rhs.as_str()
    }
}
/// Content comparison against a doubly-referenced string slice.
impl PartialEq<&&str> for Ustr {
    fn eq(&self, rhs: &&&str) -> bool {
        let plain: &str = rhs;
        self.as_str() == plain
    }
}
/// Content comparison from a doubly-referenced string slice to a `Ustr`.
impl PartialEq<Ustr> for &&str {
    fn eq(&self, rhs: &Ustr) -> bool {
        let plain: &str = self;
        plain == rhs.as_str()
    }
}
impl PartialEq<String> for Ustr {
fn eq(&self, other: &String) -> bool {
self.as_str() == other
}
}
/// Content comparison between an owned `String` and an interned string.
impl PartialEq<Ustr> for String {
    fn eq(&self, rhs: &Ustr) -> bool {
        self.as_str() == rhs.as_str()
    }
}
impl PartialEq<&String> for Ustr {
fn eq(&self, other: &&String) -> bool {
self.as_str() == *other
}
}
/// Content comparison between a `&String` and an interned string.
impl PartialEq<Ustr> for &String {
    fn eq(&self, rhs: &Ustr) -> bool {
        let plain: &str = self;
        plain == rhs.as_str()
    }
}
/// Content comparison between an interned string and a boxed `str`.
impl PartialEq<Box<str>> for Ustr {
    fn eq(&self, rhs: &Box<str>) -> bool {
        let plain: &str = rhs;
        self.as_str() == plain
    }
}
/// Content comparison between a boxed `str` and an interned string.
impl PartialEq<Ustr> for Box<str> {
    fn eq(&self, rhs: &Ustr) -> bool {
        let plain: &str = self;
        plain == rhs.as_str()
    }
}
/// Content comparison between a reference to a boxed `str` and a `Ustr`.
impl PartialEq<Ustr> for &Box<str> {
    fn eq(&self, rhs: &Ustr) -> bool {
        let plain: &str = self;
        plain == rhs.as_str()
    }
}
impl PartialEq<Cow<'_, str>> for Ustr {
fn eq(&self, other: &Cow<'_, str>) -> bool {
self.as_str() == &*other
}
}
/// Content comparison between a `Cow<str>` and an interned string.
impl PartialEq<Ustr> for Cow<'_, str> {
    fn eq(&self, rhs: &Ustr) -> bool {
        let plain: &str = self;
        plain == rhs.as_str()
    }
}
impl PartialEq<&Cow<'_, str>> for Ustr {
fn eq(&self, other: &&Cow<'_, str>) -> bool {
self.as_str() == &**other
}
}
/// Content comparison between a `&Cow<str>` and an interned string.
impl PartialEq<Ustr> for &Cow<'_, str> {
    fn eq(&self, rhs: &Ustr) -> bool {
        let plain: &str = self;
        plain == rhs.as_str()
    }
}
/// Compares a `Path` with the interned text viewed as a path.
impl PartialEq<Ustr> for Path {
    fn eq(&self, rhs: &Ustr) -> bool {
        let rhs_path = Path::new(rhs.as_str());
        self == rhs_path
    }
}
/// Compares a `&Path` with the interned text viewed as a path.
impl PartialEq<Ustr> for &Path {
    fn eq(&self, rhs: &Ustr) -> bool {
        let lhs_path: &Path = self;
        lhs_path == Path::new(rhs.as_str())
    }
}
/// Compares an `OsStr` with the interned text viewed as an `OsStr`.
impl PartialEq<Ustr> for OsStr {
    fn eq(&self, rhs: &Ustr) -> bool {
        let rhs_os = OsStr::new(rhs.as_str());
        self == rhs_os
    }
}
/// Compares an `&OsStr` with the interned text viewed as an `OsStr`.
impl PartialEq<Ustr> for &OsStr {
    fn eq(&self, rhs: &Ustr) -> bool {
        let lhs_os: &OsStr = self;
        lhs_os == OsStr::new(rhs.as_str())
    }
}
// Marker impl: the derived `PartialEq` on `Ustr` compares the stored pointer,
// which is a reflexive, symmetric and transitive relation.
impl Eq for Ustr {}
/// Forwards every `AsRef<T>` conversion that `str` supports.
impl<T: ?Sized> AsRef<T> for Ustr
where
    str: AsRef<T>,
{
    fn as_ref(&self) -> &T {
        let text: &'static str = self.as_str();
        <str as AsRef<T>>::as_ref(text)
    }
}
/// Parsing a `Ustr` interns the input; this implementation never returns `Err`.
impl FromStr for Ustr {
    type Err = std::string::ParseError;

    #[inline]
    fn from_str(text: &str) -> Result<Self, Self::Err> {
        let interned = Ustr::from(text);
        Ok(interned)
    }
}
/// Interns a string slice. `Ustr::from` here resolves to the inherent
/// associated function, not to this trait method.
impl From<&str> for Ustr {
    fn from(text: &str) -> Self {
        Ustr::from(text)
    }
}
/// Extracts the `'static` string slice a `Ustr` refers to.
impl From<Ustr> for &'static str {
    fn from(handle: Ustr) -> Self {
        handle.as_str()
    }
}
/// Copies the interned text into a new `String`.
impl From<Ustr> for String {
    fn from(handle: Ustr) -> Self {
        let text: &str = handle.as_str();
        text.to_owned()
    }
}
/// Copies the interned text into a new `Box<str>`.
impl From<Ustr> for Box<str> {
    fn from(handle: Ustr) -> Self {
        let text: &str = handle.as_str();
        text.into()
    }
}
/// Copies the interned text into a new `Rc<str>`.
impl From<Ustr> for Rc<str> {
    fn from(handle: Ustr) -> Self {
        let text: &str = handle.as_str();
        text.into()
    }
}
/// Copies the interned text into a new `Arc<str>`.
impl From<Ustr> for Arc<str> {
    fn from(handle: Ustr) -> Self {
        let text: &str = handle.as_str();
        text.into()
    }
}
/// Wraps the interned text as a borrowed `Cow`; nothing is copied.
impl From<Ustr> for Cow<'static, str> {
    fn from(handle: Ustr) -> Self {
        let text: &'static str = handle.as_str();
        Cow::Borrowed(text)
    }
}
/// Interns the contents of an owned `String`.
impl From<String> for Ustr {
    fn from(owned: String) -> Self {
        Ustr::from(owned.as_str())
    }
}
/// Interns the contents of a borrowed `String`.
impl From<&String> for Ustr {
    fn from(borrowed: &String) -> Self {
        Ustr::from(borrowed.as_str())
    }
}
/// Interns the contents of a boxed `str`.
impl From<Box<str>> for Ustr {
    fn from(boxed: Box<str>) -> Self {
        let text: &str = &boxed;
        Ustr::from(text)
    }
}
/// Interns the contents of an `Rc<str>`.
impl From<Rc<str>> for Ustr {
    fn from(shared: Rc<str>) -> Self {
        let text: &str = &shared;
        Ustr::from(text)
    }
}
/// Interns the contents of an `Arc<str>`.
impl From<Arc<str>> for Ustr {
    fn from(shared: Arc<str>) -> Self {
        let text: &str = &shared;
        Ustr::from(text)
    }
}
/// Interns the contents of a `Cow<str>`, borrowed or owned.
impl From<Cow<'_, str>> for Ustr {
    fn from(cow: Cow<'_, str>) -> Self {
        let text: &str = &cow;
        Ustr::from(text)
    }
}
/// The default `Ustr` is the interned empty string.
impl Default for Ustr {
    fn default() -> Self {
        let empty = "";
        Ustr::from(empty)
    }
}
impl Deref for Ustr {
type Target = str;
fn deref(&self) -> &Self::Target {
self.as_str()
}
}
/// Displays the interned text exactly like the underlying `str` would.
impl fmt::Display for Ustr {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // `write!(f, "{}", ..)` would start a fresh format spec and silently
        // drop the caller's width / fill / alignment / precision (for example
        // `format!("{:>8}", u)`). `pad` applies those flags; plain `{}` output
        // is unchanged.
        f.pad(self.as_str())
    }
}
/// Debug output wraps the quoted, escaped text as `u!("...")`.
impl fmt::Debug for Ustr {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let text: &str = self.as_str();
        f.write_fmt(format_args!("u!({:?})", text))
    }
}
/// Feeds the hash stored in the cache entry to the hasher instead of
/// re-hashing the string bytes.
impl Hash for Ustr {
    fn hash<H: Hasher>(&self, state: &mut H) {
        // `u64::hash` is exactly one `write_u64` call, so this is the same
        // sequence of hasher operations.
        let stored: u64 = self.precomputed_hash();
        state.write_u64(stored);
    }
}
/// Calls `clear()` on every bin of the global cache, locking one bin at a
/// time.
///
/// # Safety
///
/// NOTE(review): presumably this invalidates the memory behind every `Ustr`
/// created earlier, so callers must not use any such handle afterwards —
/// confirm against `StringCache::clear`.
#[doc(hidden)]
pub unsafe fn _clear_cache() {
    STRING_CACHE.0.iter().for_each(|bin| {
        bin.lock().clear();
    });
}
/// Sums `total_allocated()` over all cache bins. Each bin is locked only
/// while its own value is read.
pub fn total_allocated() -> usize {
    let mut total = 0;
    for bin in STRING_CACHE.0.iter() {
        total += bin.lock().total_allocated();
    }
    total
}
/// Sums `total_capacity()` over all cache bins. Each bin is locked only
/// while its own value is read.
pub fn total_capacity() -> usize {
    let mut total = 0;
    for bin in STRING_CACHE.0.iter() {
        total += bin.lock().total_capacity();
    }
    total
}
/// Shorthand for [`Ustr::from`]: interns `s` and returns its handle.
#[inline]
pub fn ustr(s: &str) -> Ustr {
    let interned = Ustr::from(s);
    interned
}
/// Shorthand for [`Ustr::from_existing`]: looks `s` up without inserting it.
#[inline]
pub fn existing_ustr(s: &str) -> Option<Ustr> {
    let found = Ustr::from_existing(s);
    found
}
/// Returns a reference to the global set of cache bins.
pub fn cache() -> &'static Bins {
    let bins: &'static Bins = &STRING_CACHE;
    bins
}
/// Sums `num_entries()` over all cache bins. Each bin is locked only while
/// its own count is read.
pub fn num_entries() -> usize {
    let mut count = 0;
    for bin in STRING_CACHE.0.iter() {
        count += bin.lock().num_entries();
    }
    count
}
/// Returns `num_entries()` for each cache bin, in bin order.
#[doc(hidden)]
pub fn num_entries_per_bin() -> Vec<usize> {
    let bins = STRING_CACHE.0.iter();
    bins.map(|bin| bin.lock().num_entries()).collect()
}
/// Builds a `StringCacheIterator` over every allocation of every bin.
///
/// For each bin (locked while it is inspected) the `(ptr, end)` pair of every
/// entry in `old_allocs` is recorded, followed by the pair of the current
/// `alloc` unless its `ptr` equals its `end`. The iterator starts at the first
/// recorded `ptr`, or at a null pointer when nothing was recorded.
pub fn string_cache_iter() -> StringCacheIterator {
    let mut allocs = Vec::new();
    for bin in STRING_CACHE.0.iter() {
        let cache = bin.lock();
        for old in &cache.old_allocs {
            allocs.push((old.ptr(), old.end()));
        }
        let (start, end) = (cache.alloc.ptr(), cache.alloc.end());
        if start != end {
            allocs.push((start, end));
        }
    }

    let current_ptr = match allocs.first() {
        Some(&(first_start, _)) => first_start,
        None => std::ptr::null(),
    };

    StringCacheIterator {
        allocs,
        current_alloc: 0,
        current_ptr,
    }
}
/// The set of cache bins: a fixed-size array of `NUM_BINS` mutex-guarded
/// `StringCache`s. `#[repr(transparent)]` gives it the same layout as the
/// wrapped array.
#[repr(transparent)]
pub struct Bins(pub(crate) [Mutex<StringCache>; NUM_BINS]);
// Test-only lock. Tests in `mod tests` take it first and hold the guard for
// their whole body, so tests that use the shared global cache do not overlap.
#[cfg(test)]
lazy_static::lazy_static! {
static ref TEST_LOCK: Mutex<()> = Mutex::new(());
}
#[cfg(test)]
mod tests {
use super::TEST_LOCK;
use lazy_static::lazy_static;
use std::ffi::OsStr;
use std::path::Path;
use std::sync::Mutex;
/// Interned strings compare equal to the `&str` / `String` they came from.
#[test]
fn it_works() {
    use super::ustr;

    let _guard = TEST_LOCK.lock();

    let hello = ustr("hello");
    assert_eq!(hello, "hello");

    let world = ustr("world");
    assert_eq!(world, String::from("world"));
}
/// Interning the empty string twice leaves exactly one cache entry.
#[test]
fn empty_string() {
    use super::ustr;

    let _guard = TEST_LOCK.lock();
    unsafe { super::_clear_cache() };

    let _first = ustr("");
    let second = ustr("");

    assert!(second.as_str().is_empty());
    assert_eq!(super::num_entries(), 1);
}
/// The pointer from `as_char_ptr` can be read back as a NUL-terminated C
/// string, for ASCII and for multi-byte (Greek) text alike.
#[test]
fn c_str_works() {
    let _t = TEST_LOCK.lock();
    use super::ustr as u;
    use std::ffi::CStr;

    let s_fox = "The quick brown fox jumps over the lazy dog.";
    let u_fox = u(s_fox);
    let fox = unsafe { CStr::from_ptr(u_fox.as_char_ptr()) }
        .to_string_lossy()
        .into_owned();
    assert_eq!(fox, s_fox);

    // The literal previously contained mojibake ("έ" mis-decoded as Latin-1);
    // restored to the intended Greek sentence.
    let s_odys = "Τη γλώσσα μου έδωσαν ελληνική";
    let u_odys = u(s_odys);
    let odys = unsafe { CStr::from_ptr(u_odys.as_char_ptr()) }
        .to_string_lossy()
        .into_owned();
    assert_eq!(odys, s_odys);
}
/// Interns the whitespace-separated words of `blns.txt` (cycled to 100 000
/// insertions) and checks that the cache holds exactly the distinct words.
#[test]
#[cfg_attr(miri, ignore)]
fn blns() {
    use super::{string_cache_iter, ustr};
    use std::collections::HashSet;

    let _guard = TEST_LOCK.lock();
    unsafe { super::_clear_cache() };

    let corpus = include_str!("../data/blns.txt");
    let expected: HashSet<&str> = corpus.split_whitespace().collect();

    let mut handles = Vec::new();
    let mut owned = Vec::new();
    for word in corpus.split_whitespace().cycle().take(100_000) {
        handles.push(ustr(word));
        owned.push(word.to_owned());
    }

    let interned: HashSet<_> = string_cache_iter().collect();
    let missing: HashSet<_> = expected.difference(&interned).collect();

    assert_eq!(super::num_entries(), expected.len());
    assert_eq!(missing.len(), 0);

    let per_bin = super::num_entries_per_bin();
    println!("{:?}", per_bin);
    println!("Total allocated: {}", super::total_allocated());
    println!("Total capacity: {}", super::total_capacity());
    println!(
        "size of StringCache: {}",
        std::mem::size_of::<super::StringCache>()
    );
}
/// Stress test: 600 rounds of clearing the cache and interning 20 000 path
/// strings built from groups of three words of the raft data file.
#[test]
#[cfg_attr(miri, ignore)]
fn raft() {
    use super::ustr;
    use std::sync::Arc;

    let _guard = TEST_LOCK.lock();

    let text = include_str!("../data/raft-large-directories.txt");
    let words: Vec<&str> = text.split_whitespace().collect();
    let joined: Vec<String> = words
        .chunks(3)
        .map(|parts| match parts {
            [a, b, c] => format!("{}/{}/{}", a, b, c),
            // A short trailing chunk contributes only its first word.
            _ => parts[0].to_owned(),
        })
        .collect();
    let raft = Arc::new(joined);
    let paths = Arc::clone(&raft);

    for _ in 0..600 {
        let mut handles = Vec::with_capacity(20_000);
        unsafe { super::_clear_cache() };
        for path in paths.iter().cycle().take(20_000) {
            handles.push(ustr(path));
        }
    }
}
/// Serializes the whole cache to JSON, clears it, deserializes it again and
/// checks that every distinct word of `blns.txt` is back in the cache.
#[cfg(all(feature = "serde", not(miri)))]
#[test]
fn serialization() {
    use super::{string_cache_iter, ustr};
    use std::collections::HashSet;

    let _guard = TEST_LOCK.lock();
    unsafe { super::_clear_cache() };

    let manifest_dir =
        std::env::var("CARGO_MANIFEST_DIR").expect("CARGO_MANIFEST_DIR not set");
    let path = std::path::Path::new(&manifest_dir)
        .join("data")
        .join("blns.txt");
    let corpus = std::fs::read_to_string(path).unwrap();

    let expected: HashSet<&str> = corpus.split_whitespace().collect();

    let mut handles = Vec::new();
    let mut owned = Vec::new();
    for word in corpus.split_whitespace().cycle().take(100_000) {
        handles.push(ustr(word));
        owned.push(word.to_owned());
    }

    let json = serde_json::to_string(super::cache()).unwrap();
    unsafe { super::_clear_cache() };
    let _: super::DeserializedCache = serde_json::from_str(&json).unwrap();

    let restored: HashSet<_> = string_cache_iter().collect();
    let missing: HashSet<_> = expected.difference(&restored).collect();

    assert_eq!(super::num_entries(), expected.len());
    assert_eq!(missing.len(), 0);
}
/// A single `Ustr` survives a JSON round trip.
#[cfg(all(feature = "serde", not(miri)))]
#[test]
fn serialization_ustr() {
    use super::{ustr, Ustr};

    let _guard = TEST_LOCK.lock();

    let original = ustr("hello");
    let json = serde_json::to_string(&original).unwrap();
    let restored: Ustr = serde_json::from_str(&json).unwrap();

    assert_eq!(original, restored);
}
/// `<` on `Ustr` follows the order of the string contents.
#[test]
fn partial_ord() {
    use super::ustr;

    let _guard = TEST_LOCK.lock();

    let low = ustr("aaa");
    let high = ustr("zzz");
    let middle = ustr("kkk");

    assert!(low < middle);
    assert!(middle < high);
}
/// Sorting a vector of `Ustr` orders it by string contents.
#[test]
fn ord() {
    use super::ustr;

    let _guard = TEST_LOCK.lock();

    let apple = ustr("apple");
    let bravo = ustr("bravo");
    let charlie = ustr("charlie");
    let delta = ustr("delta");

    let mut shuffled = vec![delta, bravo, charlie, apple];
    shuffled.sort();

    assert_eq!(shuffled, vec![apple, bravo, charlie, delta]);
}
/// Test helper: accepts anything convertible into a `&str` and returns the
/// converted slice.
fn takes_into_str<'a, S>(s: S) -> &'a str
where
    S: Into<&'a str>,
{
    let converted: &'a str = s.into();
    converted
}
/// A `Ustr` can be passed where `Into<&str>` is expected.
#[test]
fn test_into_str() {
    use super::ustr;

    let _guard = TEST_LOCK.lock();

    let converted = takes_into_str(ustr("converted"));
    assert_eq!("converted", converted);
}
/// `existing_ustr` finds a string only after it has been interned.
#[test]
fn test_existing_ustr() {
    use super::{existing_ustr, ustr};

    let _guard = TEST_LOCK.lock();

    assert_eq!(existing_ustr("hello world!"), None);

    let interned = ustr("hello world!");
    let looked_up = existing_ustr("hello world!");
    assert_eq!(Some(interned), looked_up);
}
/// Iterating a freshly cleared cache yields no strings.
#[test]
fn test_empty_cache() {
    // Take the shared test lock like every other test in this module.
    // Without it this test clears the global cache while other tests may
    // still hold `Ustr` handles, and another test may insert between the
    // clear and the assertion below.
    let _t = TEST_LOCK.lock();
    unsafe { super::_clear_cache() };
    assert_eq!(
        super::string_cache_iter().collect::<Vec<_>>(),
        Vec::<&'static str>::new()
    );
}
/// Exercises the `AsRef` conversions and the cross-type `PartialEq` impls.
#[test]
fn as_refs() {
    let _guard = TEST_LOCK.lock();

    let handle = super::ustr("test");

    let owned: String = handle.to_owned();
    assert_eq!(handle, owned);
    assert_eq!(owned, handle);

    let as_path: &Path = handle.as_ref();
    assert_eq!(as_path, handle);

    let _: &[u8] = handle.as_ref();

    let as_os: &OsStr = handle.as_ref();
    assert_eq!(as_path, as_os);
    assert_eq!(as_os, as_path);

    let cow = std::borrow::Cow::from(handle);
    assert_eq!(cow, handle);
    assert_eq!(handle, cow);

    let boxed: Box<str> = handle.into();
    assert_eq!(boxed, handle);
}
}
lazy_static::lazy_static! {
    // The process-wide string cache: `NUM_BINS` independently locked
    // `StringCache` bins; `whichbin` selects the bin for a given hash.
    //
    // `std::array::from_fn` builds the array element by element in safe code,
    // replacing the earlier `MaybeUninit` array + `transmute` dance. The array
    // length is inferred from the `Bins` field type.
    static ref STRING_CACHE: Bins = Bins(std::array::from_fn(|_| {
        Mutex::new(StringCache::default())
    }));
}
/// Maps a hash to a bin index: the hash is shifted right by `TOP_SHIFT` and
/// the result reduced modulo `NUM_BINS`.
#[inline]
fn whichbin(hash: u64) -> usize {
    let top_bits = hash >> (TOP_SHIFT as u64);
    let bin = top_bits % (NUM_BINS as u64);
    bin as usize
}