ustr/
lib.rs

1//! Fast, FFI-friendly string interning. A [`Ustr`] (**U**nique **Str**) is a
2//! lightweight handle representing a static, immutable entry in a global string
3//! cache, allowing for:
4//!
5//! * Extremely fast string assignment and comparisons -- it's just a pointer
6//!   comparison.
7//!
8//! * Efficient storage -- only one copy of the string is held in memory, and
9//!   getting access to it is just a pointer indirection.
10//!
11//! * Fast hashing -- the precomputed hash is stored with the string.
12//!
13//! * Fast FFI -- the string is stored with a terminating null byte so can be
14//!   passed to C directly without doing the `CString` dance.
15//!
16//! The downside is no strings are ever freed, so if you're creating lots and
17//! lots of strings, you might run out of memory. On the other hand, War and
18//! Peace is only 3MB, so it's probably fine.
19//!
20//! This crate is based on [OpenImageIO's](https://openimageio.readthedocs.io/en/v2.4.10.0/)
21//! (OIIO) [`ustring`](https://github.com/OpenImageIO/oiio/blob/master/src/include/OpenImageIO/ustring.h)
22//! but it is *not* binary-compatible (yet). The underlying hash map
23//! implementation is directy ported from OIIO.
24//!
25//! # Usage
26//!
27//! ```
28//! use ustr::{Ustr, ustr, ustr as u};
29//!
30//! # unsafe { ustr::_clear_cache() };
31//! // Creation is quick and easy using either `Ustr::from` or the ustr function
32//! // and only one copy of any string is stored.
33//! let u1 = Ustr::from("the quick brown fox");
34//! let u2 = ustr("the quick brown fox");
35//!
36//! // Comparisons and copies are extremely cheap.
37//! let u3 = u1;
38//! assert_eq!(u2, u3);
39//!
40//! // You can pass straight to FFI.
41//! let len = unsafe {
42//!     libc::strlen(u1.as_char_ptr())
43//! };
44//! assert_eq!(len, 19);
45//!
46//! // Use as_str() to get a `str`.
47//! let words: Vec<&str> = u1.as_str().split_whitespace().collect();
48//! assert_eq!(words, ["the", "quick", "brown", "fox"]);
49//!
50//! // For best performance when using Ustr as key for a HashMap or HashSet,
51//! // you'll want to use the precomputed hash. To make this easier, just use
52//! // the UstrMap and UstrSet exports:
53//! use ustr::UstrMap;
54//!
55//! // Key type is always `Ustr`.
56//! let mut map: UstrMap<usize> = UstrMap::default();
57//! map.insert(u1, 17);
58//! assert_eq!(*map.get(&u1).unwrap(), 17);
59//! ```
60//!
61//! By enabling the `"serde"` feature you can serialize individual `Ustr`s
62//! or the whole cache with serde.
63//!
64//! ```
65//! # #[cfg(feature = "serde")] {
66//! use ustr::{Ustr, ustr};
67//! let u_ser = ustr("serde");
68//! let json = serde_json::to_string(&u_ser).unwrap();
69//! let u_de : Ustr = serde_json::from_str(&json).unwrap();
70//! assert_eq!(u_ser, u_de);
71//! # }
72//! ```
73//!
74//! Since the cache is global, use the `ustr::DeserializedCache` dummy object to
75//! drive the deserialization.
76//!
77//! ```
78//! # #[cfg(feature = "serde")] {
79//! use ustr::{Ustr, ustr};
80//! ustr("Send me to JSON and back");
81//! let json = serde_json::to_string(ustr::cache()).unwrap();
82//!
83//! // ... some time later ...
84//! let _: ustr::DeserializedCache = serde_json::from_str(&json).unwrap();
85//! assert_eq!(ustr::num_entries(), 1);
86//! assert_eq!(ustr::string_cache_iter().collect::<Vec<_>>(), vec!["Send me to JSON and back"]);
87//! # }
88//! ```
89//!
90//! ## Why?
91//!
92//! It is common in certain types of applications to use strings as identifiers,
93//! but not really do any processing with them.
94//! To paraphrase from OIIO's `Ustring` documentation -- compared to standard
95//! strings, `Ustr`s have several advantages:
96//!
97//!   - Each individual `Ustr` is very small -- in fact, we guarantee that a
98//!     `Ustr` is the same size and memory layout as an ordinary `*u8`.
99//!
100//!   - Storage is frugal, since there is only one allocated copy of each unique
101//!     character sequence, throughout the lifetime of the program.
102//!
103//!   - Assignment from one `Ustr` to another is just copy of the pointer; no
104//!     allocation, no character copying, no reference counting.
105//!
106//!   - Equality testing (do the strings contain the same characters) is a
107//!     single operation, the comparison of the pointer.
108//!
109//!   - Memory allocation only occurs when a new `Ustr` is constructed from raw
110//!     characters the FIRST time -- subsequent constructions of the same string
111//!     just finds it in the canonial string set, but doesn't need to allocate
112//!     new storage.  Destruction of a `Ustr` is trivial, there is no
113//!     de-allocation because the canonical version stays in the set.  Also,
114//!     therefore, no user code mistake can lead to memory leaks.
115//!
116//! But there are some problems, too.  Canonical strings are never freed
117//! from the table.  So in some sense all the strings "leak", but they
118//! only leak one copy for each unique string that the program ever comes
119//! across.
120//!
121//! On the whole, `Ustr`s are a really great string representation
122//!
123//!   - if you tend to have (relatively) few unique strings, but many copies of
124//!     those strings;
125//!
126//!   - if the creation of strings from raw characters is relatively rare
127//!     compared to copying or comparing to existing strings;
128//!
129//!   - if you tend to make the same strings over and over again, and if it's
130//!     relatively rare that a single unique character sequence is used only
131//!     once in the entire lifetime of the program;
132//!
133//!   - if your most common string operations are assignment and equality
134//!     testing and you want them to be as fast as possible;
135//!
136//!   - if you are doing relatively little character-by-character assembly of
137//!     strings, string concatenation, or other "string manipulation" (other
138//!     than equality testing).
139//!
140//! `Ustr`s are not so hot
141//!
142//!   - if your program tends to have very few copies of each character sequence
143//!     over the entire lifetime of the program;
144//!
145//!   - if your program tends to generate a huge variety of unique strings over
146//!     its lifetime, each of which is used only a short time and then
147//!     discarded, never to be needed again;
148//!
149//!   - if you don't need to do a lot of string assignment or equality testing,
150//!     but lots of more complex string manipulation.
151//!
152//! ## Safety and Compatibility
153//!
154//! This crate contains a significant amount of unsafe but usage has been
155//! checked and is well-documented. It is also run through Miri as part of the
156//! CI process. I use it regularly on 64-bit systems, and it has passed Miri on
157//! a 32-bit system as well, bit 32-bit is not checked regularly. If you want to
158//! use it on 32-bit, please make sure to run Miri and open and issue if you
159//! find any problems.
160use parking_lot::Mutex;
161use std::{
162    borrow::Cow,
163    cmp::Ordering,
164    ffi::{CStr, OsStr},
165    fmt,
166    hash::{Hash, Hasher},
167    ops::Deref,
168    os::raw::c_char,
169    path::Path,
170    ptr::NonNull,
171    rc::Rc,
172    slice, str,
173    str::FromStr,
174    sync::Arc,
175};
176
177mod hash;
178pub use hash::*;
179mod bumpalloc;
180
181mod stringcache;
182pub use stringcache::*;
183#[cfg(feature = "serde")]
184pub mod serialization;
185#[cfg(feature = "serde")]
186pub use serialization::DeserializedCache;
187
188/// A handle representing a string in the global string cache.
189///
190/// To use, create one using [`Ustr::from`] or the [`ustr`] function. You can
191/// freely copy, destroy or send `Ustr`s to other threads: the underlying string
192/// is always valid in memory (and is never destroyed).
193#[derive(Copy, Clone, PartialEq)]
194#[repr(transparent)]
195pub struct Ustr {
196    char_ptr: NonNull<u8>,
197}
198
199/// Defer to `str` for equality.
200///
201/// Lexicographic ordering will be slower than pointer comparison, but much less
202/// surprising if you use `Ustr`s as keys in e.g. a `BTreeMap`.
203impl Ord for Ustr {
204    fn cmp(&self, other: &Self) -> Ordering {
205        self.as_str().cmp(other.as_str())
206    }
207}
208
209/// Defer to `str` for equality.
210///
211/// Lexicographic ordering will be slower thanpointer comparison, but much less
212/// surprising if you use `Ustr`s as keys in e.g. a `BTreeMap`.
213#[allow(clippy::non_canonical_partial_ord_impl)]
214impl PartialOrd for Ustr {
215    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
216        Some(self.cmp(other))
217    }
218}
219
220impl Ustr {
221    /// Create a new `Ustr` from the given `str`.
222    ///
223    /// You can also use the [`ustr`] function.
224    ///
225    /// # Examples
226    ///
227    /// ```
228    /// use ustr::{Ustr, ustr as u};
229    /// # unsafe { ustr::_clear_cache() };
230    ///
231    /// let u1 = Ustr::from("the quick brown fox");
232    /// let u2 = u("the quick brown fox");
233    /// assert_eq!(u1, u2);
234    /// assert_eq!(ustr::num_entries(), 1);
235    /// ```
236    pub fn from(string: &str) -> Ustr {
237        let hash = {
238            let mut hasher = ahash::AHasher::default();
239            hasher.write(string.as_bytes());
240            hasher.finish()
241        };
242        let mut sc = STRING_CACHE.0[whichbin(hash)].lock();
243        Ustr {
244            // SAFETY: sc.insert does not give back a null pointer
245            char_ptr: unsafe {
246                NonNull::new_unchecked(sc.insert(string, hash) as *mut _)
247            },
248        }
249    }
250
251    pub fn from_existing(string: &str) -> Option<Ustr> {
252        let hash = {
253            let mut hasher = ahash::AHasher::default();
254            hasher.write(string.as_bytes());
255            hasher.finish()
256        };
257        let sc = STRING_CACHE.0[whichbin(hash)].lock();
258        sc.get_existing(string, hash).map(|ptr| Ustr {
259            char_ptr: unsafe { NonNull::new_unchecked(ptr as *mut _) },
260        })
261    }
262
263    /// Get the cached `Ustr` as a `str`.
264    ///
265    /// # Examples
266    ///
267    /// ```
268    /// use ustr::ustr as u;
269    /// # unsafe { ustr::_clear_cache() };
270    ///
271    /// let u_fox = u("the quick brown fox");
272    /// let words: Vec<&str> = u_fox.as_str().split_whitespace().collect();
273    /// assert_eq!(words, ["the", "quick", "brown", "fox"]);
274    /// ```
275    pub fn as_str(&self) -> &'static str {
276        // This is safe if:
277        // 1) self.char_ptr points to a valid address
278        // 2) len is a usize stored usize aligned usize bytes before char_ptr
279        // 3) char_ptr points to a valid UTF-8 string of len bytes.
280        // All these are guaranteed by StringCache::insert() and by the fact
281        // we can only construct a Ustr from a valid &str.
282        unsafe {
283            str::from_utf8_unchecked(slice::from_raw_parts(
284                self.char_ptr.as_ptr(),
285                self.len(),
286            ))
287        }
288    }
289
290    /// Get the cached string as a C `char*`.
291    ///
292    /// This includes the null terminator so is safe to pass straight to FFI.
293    ///
294    /// # Examples
295    ///
296    /// ```
297    /// use ustr::ustr as u;
298    /// # unsafe { ustr::_clear_cache() };
299    ///
300    /// let u_fox = u("the quick brown fox");
301    /// let len = unsafe {
302    ///     libc::strlen(u_fox.as_char_ptr())
303    /// };
304    /// assert_eq!(len, 19);
305    /// ```
306    ///
307    /// # Safety
308    ///
309    /// This is just passing a raw byte array with a null terminator to C. If
310    /// your source string contains non-ascii bytes then this will pass them
311    /// straight along with no checking.
312    ///
313    /// The string is **immutable**. That means that if you modify it across the
314    /// FFI boundary then all sorts of terrible things will happen.
315    pub fn as_char_ptr(&self) -> *const c_char {
316        self.char_ptr.as_ptr() as *const c_char
317    }
318
319    /// Get this `Ustr` as a [`CStr`]
320    ///
321    /// This is useful for passing to APIs (like ash) that use `CStr`.
322    ///
323    /// # Safety
324    ///
325    /// This function by itself is safe as the pointer and length are guaranteed
326    /// to be valid. All the same caveats for the use of the `CStr` as given in
327    /// the `CStr` docs apply.
328    pub fn as_cstr(&self) -> &CStr {
329        unsafe {
330            CStr::from_bytes_with_nul_unchecked(slice::from_raw_parts(
331                self.as_ptr(),
332                self.len() + 1,
333            ))
334        }
335    }
336
337    /// Get a raw pointer to the `StringCacheEntry`.
338    #[inline]
339    fn as_string_cache_entry(&self) -> &StringCacheEntry {
340        // The allocator guarantees that the alignment is correct and that
341        // this pointer is non-null
342        unsafe { &*(self.char_ptr.as_ptr().cast::<StringCacheEntry>().sub(1)) }
343    }
344
345    /// Get the length (in bytes) of this string.
346    #[inline]
347    pub fn len(&self) -> usize {
348        self.as_string_cache_entry().len
349    }
350
351    /// Returns true if the length is zero.
352    pub fn is_empty(&self) -> bool {
353        self.len() == 0
354    }
355
356    /// Get the precomputed hash for this string.
357    #[inline]
358    pub fn precomputed_hash(&self) -> u64 {
359        self.as_string_cache_entry().hash
360    }
361
362    /// Get an owned String copy of this string.
363    pub fn to_owned(&self) -> String {
364        self.as_str().to_owned()
365    }
366}
367
368// We're safe to impl these because the strings they reference are immutable
369// and for all intents and purposes 'static since they're never deleted after
370// being created
371unsafe impl Send for Ustr {}
372unsafe impl Sync for Ustr {}
373
374impl PartialEq<str> for Ustr {
375    fn eq(&self, other: &str) -> bool {
376        self.as_str() == other
377    }
378}
379
380impl PartialEq<Ustr> for str {
381    fn eq(&self, u: &Ustr) -> bool {
382        self == u.as_str()
383    }
384}
385
386impl PartialEq<&str> for Ustr {
387    fn eq(&self, other: &&str) -> bool {
388        self.as_str() == *other
389    }
390}
391
392impl PartialEq<Ustr> for &str {
393    fn eq(&self, u: &Ustr) -> bool {
394        *self == u.as_str()
395    }
396}
397
398impl PartialEq<&&str> for Ustr {
399    fn eq(&self, other: &&&str) -> bool {
400        self.as_str() == **other
401    }
402}
403
404impl PartialEq<Ustr> for &&str {
405    fn eq(&self, u: &Ustr) -> bool {
406        **self == u.as_str()
407    }
408}
409
410impl PartialEq<String> for Ustr {
411    fn eq(&self, other: &String) -> bool {
412        self.as_str() == other
413    }
414}
415
416impl PartialEq<Ustr> for String {
417    fn eq(&self, u: &Ustr) -> bool {
418        self == u.as_str()
419    }
420}
421
422impl PartialEq<&String> for Ustr {
423    fn eq(&self, other: &&String) -> bool {
424        self.as_str() == *other
425    }
426}
427
428impl PartialEq<Ustr> for &String {
429    fn eq(&self, u: &Ustr) -> bool {
430        *self == u.as_str()
431    }
432}
433
434impl PartialEq<Box<str>> for Ustr {
435    fn eq(&self, other: &Box<str>) -> bool {
436        self.as_str() == &**other
437    }
438}
439
440impl PartialEq<Ustr> for Box<str> {
441    fn eq(&self, u: &Ustr) -> bool {
442        &**self == u.as_str()
443    }
444}
445
446impl PartialEq<Ustr> for &Box<str> {
447    fn eq(&self, u: &Ustr) -> bool {
448        &***self == u.as_str()
449    }
450}
451
452impl PartialEq<Cow<'_, str>> for Ustr {
453    fn eq(&self, other: &Cow<'_, str>) -> bool {
454        self.as_str() == &*other
455    }
456}
457
458impl PartialEq<Ustr> for Cow<'_, str> {
459    fn eq(&self, u: &Ustr) -> bool {
460        &*self == u.as_str()
461    }
462}
463
464impl PartialEq<&Cow<'_, str>> for Ustr {
465    fn eq(&self, other: &&Cow<'_, str>) -> bool {
466        self.as_str() == &**other
467    }
468}
469
470impl PartialEq<Ustr> for &Cow<'_, str> {
471    fn eq(&self, u: &Ustr) -> bool {
472        &**self == u.as_str()
473    }
474}
475
476impl PartialEq<Ustr> for Path {
477    fn eq(&self, u: &Ustr) -> bool {
478        self == Path::new(u)
479    }
480}
481
482impl PartialEq<Ustr> for &Path {
483    fn eq(&self, u: &Ustr) -> bool {
484        *self == Path::new(u)
485    }
486}
487
488impl PartialEq<Ustr> for OsStr {
489    fn eq(&self, u: &Ustr) -> bool {
490        self == OsStr::new(u)
491    }
492}
493
494impl PartialEq<Ustr> for &OsStr {
495    fn eq(&self, u: &Ustr) -> bool {
496        *self == OsStr::new(u)
497    }
498}
499
500impl Eq for Ustr {}
501
502impl<T: ?Sized> AsRef<T> for Ustr
503where
504    str: AsRef<T>,
505{
506    fn as_ref(&self) -> &T {
507        self.as_str().as_ref()
508    }
509}
510
511impl FromStr for Ustr {
512    type Err = std::string::ParseError;
513
514    #[inline]
515    fn from_str(s: &str) -> Result<Self, Self::Err> {
516        Ok(Ustr::from(s))
517    }
518}
519
520impl From<&str> for Ustr {
521    fn from(s: &str) -> Ustr {
522        Ustr::from(s)
523    }
524}
525
526impl From<Ustr> for &'static str {
527    fn from(s: Ustr) -> &'static str {
528        s.as_str()
529    }
530}
531
532impl From<Ustr> for String {
533    fn from(u: Ustr) -> Self {
534        String::from(u.as_str())
535    }
536}
537
538impl From<Ustr> for Box<str> {
539    fn from(u: Ustr) -> Self {
540        Box::from(u.as_str())
541    }
542}
543
544impl From<Ustr> for Rc<str> {
545    fn from(u: Ustr) -> Self {
546        Rc::from(u.as_str())
547    }
548}
549
550impl From<Ustr> for Arc<str> {
551    fn from(u: Ustr) -> Self {
552        Arc::from(u.as_str())
553    }
554}
555
556impl From<Ustr> for Cow<'static, str> {
557    fn from(u: Ustr) -> Self {
558        Cow::Borrowed(u.as_str())
559    }
560}
561
562impl From<String> for Ustr {
563    fn from(s: String) -> Ustr {
564        Ustr::from(&s)
565    }
566}
567
568impl From<&String> for Ustr {
569    fn from(s: &String) -> Ustr {
570        Ustr::from(&**s)
571    }
572}
573
574impl From<Box<str>> for Ustr {
575    fn from(s: Box<str>) -> Ustr {
576        Ustr::from(&*s)
577    }
578}
579
580impl From<Rc<str>> for Ustr {
581    fn from(s: Rc<str>) -> Ustr {
582        Ustr::from(&*s)
583    }
584}
585
586impl From<Arc<str>> for Ustr {
587    fn from(s: Arc<str>) -> Ustr {
588        Ustr::from(&*s)
589    }
590}
591
592impl From<Cow<'_, str>> for Ustr {
593    fn from(s: Cow<'_, str>) -> Ustr {
594        Ustr::from(&*s)
595    }
596}
597
598impl Default for Ustr {
599    fn default() -> Self {
600        Ustr::from("")
601    }
602}
603
604impl Deref for Ustr {
605    type Target = str;
606    fn deref(&self) -> &Self::Target {
607        self.as_str()
608    }
609}
610
611impl fmt::Display for Ustr {
612    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
613        write!(f, "{}", self.as_str())
614    }
615}
616
617impl fmt::Debug for Ustr {
618    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
619        write!(f, "u!({:?})", self.as_str())
620    }
621}
622
623// Just feed the precomputed hash into the Hasher. Note that this will of course
624// be terrible unless the Hasher in question is expecting a precomputed hash.
625impl Hash for Ustr {
626    fn hash<H: Hasher>(&self, state: &mut H) {
627        self.precomputed_hash().hash(state);
628    }
629}
630
631/// DO NOT CALL THIS.
632///
633/// Clears the cache -- used for benchmarking and testing purposes to clear the
634/// cache. Calling this will invalidate any previously created `UStr`s and
635/// probably cause your house to burn down. DO NOT CALL THIS.
636///
637/// # Safety
638///
639/// DO NOT CALL THIS.
640#[doc(hidden)]
641pub unsafe fn _clear_cache() {
642    for m in STRING_CACHE.0.iter() {
643        m.lock().clear();
644    }
645}
646
647/// Returns the total amount of memory allocated and in use by the cache in
648/// bytes.
649pub fn total_allocated() -> usize {
650    STRING_CACHE
651        .0
652        .iter()
653        .map(|sc| {
654            let t = sc.lock().total_allocated();
655
656            t
657        })
658        .sum()
659}
660
661/// Returns the total amount of memory reserved by the cache in bytes.
662pub fn total_capacity() -> usize {
663    STRING_CACHE
664        .0
665        .iter()
666        .map(|sc| {
667            let t = sc.lock().total_capacity();
668            t
669        })
670        .sum()
671}
672
673/// Create a new `Ustr` from the given `str`.
674///
675/// # Examples
676///
677/// ```
678/// use ustr::ustr;
679/// # unsafe { ustr::_clear_cache() };
680///
681/// let u1 = ustr("the quick brown fox");
682/// let u2 = ustr("the quick brown fox");
683/// assert_eq!(u1, u2);
684/// assert_eq!(ustr::num_entries(), 1);
685/// ```
686#[inline]
687pub fn ustr(s: &str) -> Ustr {
688    Ustr::from(s)
689}
690
691/// Create a new `Ustr` from the given `str` but only if it already exists in
692/// the string cache.
693///
694/// # Examples
695///
696/// ```
697/// use ustr::{ustr, existing_ustr};
698/// # unsafe { ustr::_clear_cache() };
699///
700/// let u1 = existing_ustr("the quick brown fox");
701/// let u2 = ustr("the quick brown fox");
702/// let u3 = existing_ustr("the quick brown fox");
703/// assert_eq!(u1, None);
704/// assert_eq!(u3, Some(u2));
705/// ```
706#[inline]
707pub fn existing_ustr(s: &str) -> Option<Ustr> {
708    Ustr::from_existing(s)
709}
710
711/// Utility function to get a reference to the main cache object for use with
712/// serialization.
713///
714/// # Examples
715///
716/// ```
717/// # use ustr::{Ustr, ustr, ustr as u};
718/// # #[cfg(feature="serde")]
719/// # {
720/// # unsafe { ustr::_clear_cache() };
721/// ustr("Send me to JSON and back");
722/// let json = serde_json::to_string(ustr::cache()).unwrap();
723/// # }
724pub fn cache() -> &'static Bins {
725    &STRING_CACHE
726}
727
728/// Returns the number of unique strings in the cache.
729///
730/// This may be an underestimate if other threads are writing to the cache
731/// concurrently.
732///
733/// # Examples
734///
735/// ```
736/// use ustr::ustr as u;
737///
738/// let _ = u("Hello");
739/// let _ = u(", World!");
740/// assert_eq!(ustr::num_entries(), 2);
741/// ```
742pub fn num_entries() -> usize {
743    STRING_CACHE
744        .0
745        .iter()
746        .map(|sc| {
747            let t = sc.lock().num_entries();
748            t
749        })
750        .sum()
751}
752
753#[doc(hidden)]
754pub fn num_entries_per_bin() -> Vec<usize> {
755    STRING_CACHE
756        .0
757        .iter()
758        .map(|sc| {
759            let t = sc.lock().num_entries();
760            t
761        })
762        .collect::<Vec<_>>()
763}
764
765/// Return an iterator over the entire string cache.
766///
767/// If another thread is adding strings concurrently to this call then they
768/// might not show up in the view of the cache presented by this iterator.
769///
770/// # Safety
771///
772/// This returns an iterator to the state of the cache at the time when
773/// `string_cache_iter()` was called. It is of course possible that another
774/// thread will add more strings to the cache after this, but since we never
775/// destroy the strings, they remain valid, meaning it's safe to iterate over
776/// them, the list just might not be completely up to date.
777pub fn string_cache_iter() -> StringCacheIterator {
778    let mut allocs = Vec::new();
779    for m in STRING_CACHE.0.iter() {
780        let sc = m.lock();
781        // the start of the allocator's data is actually the ptr, start() just
782        // points to the beginning of the allocated region. The first bytes will
783        // be uninitialized since we're bumping down
784        for a in &sc.old_allocs {
785            allocs.push((a.ptr(), a.end()));
786        }
787        let ptr = sc.alloc.ptr();
788        let end = sc.alloc.end();
789        if ptr != end {
790            allocs.push((sc.alloc.ptr(), sc.alloc.end()));
791        }
792    }
793
794    let current_ptr =
795        allocs.first().map(|s| s.0).unwrap_or_else(std::ptr::null);
796
797    StringCacheIterator {
798        allocs,
799        current_alloc: 0,
800        current_ptr,
801    }
802}
803
804/// The type used for the global string cache.
805///
806/// This is exposed to allow e.g. serialization of the data returned by the
807/// [`cache()`] function.
808#[repr(transparent)]
809pub struct Bins(pub(crate) [Mutex<StringCache>; NUM_BINS]);
810
811#[cfg(test)]
812lazy_static::lazy_static! {
813    static ref TEST_LOCK: Mutex<()> = Mutex::new(());
814}
815
816#[cfg(test)]
817mod tests {
818    use super::TEST_LOCK;
819    use lazy_static::lazy_static;
820    use std::ffi::OsStr;
821    use std::path::Path;
822    use std::sync::Mutex;
823
824    #[test]
825    fn it_works() {
826        let _t = TEST_LOCK.lock();
827        use super::ustr as u;
828
829        let u_hello = u("hello");
830        assert_eq!(u_hello, "hello");
831        let u_world = u("world");
832        assert_eq!(u_world, String::from("world"));
833    }
834
835    #[test]
836    fn empty_string() {
837        let _t = TEST_LOCK.lock();
838        use super::ustr as u;
839
840        unsafe {
841            super::_clear_cache();
842        }
843
844        let _empty = u("");
845        let empty = u("");
846
847        assert!(empty.as_str().is_empty());
848        assert_eq!(super::num_entries(), 1);
849    }
850
851    #[test]
852    fn c_str_works() {
853        let _t = TEST_LOCK.lock();
854        use super::ustr as u;
855        use std::ffi::CStr;
856
857        let s_fox = "The quick brown fox jumps over the lazy dog.";
858        let u_fox = u(s_fox);
859        let fox = unsafe { CStr::from_ptr(u_fox.as_char_ptr()) }
860            .to_string_lossy()
861            .into_owned();
862        assert_eq!(fox, s_fox);
863
864        let s_odys = "Τη γλώσσα μου έδωσαν ελληνική";
865        let u_odys = u(s_odys);
866        let odys = unsafe { CStr::from_ptr(u_odys.as_char_ptr()) }
867            .to_string_lossy()
868            .into_owned();
869        assert_eq!(odys, s_odys);
870    }
871
872    #[test]
873    // We have to disable miri here as it's far too slow unfortunately
874    #[cfg_attr(miri, ignore)]
875    fn blns() {
876        let _t = TEST_LOCK.lock();
877        use super::{string_cache_iter, ustr as u};
878        use std::collections::HashSet;
879
880        // clear the cache first or our results will be wrong
881        unsafe { super::_clear_cache() };
882
883        // let path =
884        // std::path::Path::new(&std::env::var("CARGO_MANIFEST_DIR").unwrap())
885        //     .join("data")
886        //     .join("blns.txt");
887        // let blns = std::fs::read_to_string(path).unwrap();
888        let blns = include_str!("../data/blns.txt");
889
890        let mut hs = HashSet::new();
891        for s in blns.split_whitespace() {
892            hs.insert(s);
893        }
894
895        let mut us = Vec::new();
896        let mut ss = Vec::new();
897
898        for s in blns.split_whitespace().cycle().take(100_000) {
899            let u = u(s);
900            us.push(u);
901            ss.push(s.to_owned());
902        }
903
904        let mut hs_u = HashSet::new();
905        for s in string_cache_iter() {
906            hs_u.insert(s);
907        }
908        let diff: HashSet<_> = hs.difference(&hs_u).collect();
909
910        // check that the number of entries is the same
911        assert_eq!(super::num_entries(), hs.len());
912
913        // check that we have the exact same (unique) strings in the cache as in
914        // the source data
915        assert_eq!(diff.len(), 0);
916
917        let nbs = super::num_entries_per_bin();
918        println!("{:?}", nbs);
919
920        println!("Total allocated: {}", super::total_allocated());
921        println!("Total capacity: {}", super::total_capacity());
922
923        println!(
924            "size of StringCache: {}",
925            std::mem::size_of::<super::StringCache>()
926        );
927    }
928
929    #[test]
930    // We have to disable miri here as it's far too slow unfortunately
931    #[cfg_attr(miri, ignore)]
932    fn raft() {
933        let _t = TEST_LOCK.lock();
934        use super::ustr as u;
935        use std::sync::Arc;
936
937        // let path =
938        // std::path::Path::new(&std::env::var("CARGO_MANIFEST_DIR").unwrap())
939        //     .join("data")
940        //     .join("raft-large-directories.txt");
941        // let raft = std::fs::read_to_string(path).unwrap();
942        let raft = include_str!("../data/raft-large-directories.txt");
943        let raft = Arc::new(
944            raft.split_whitespace()
945                .collect::<Vec<_>>()
946                .chunks(3)
947                .map(|s| {
948                    if s.len() == 3 {
949                        format!("{}/{}/{}", s[0], s[1], s[2])
950                    } else {
951                        s[0].to_owned()
952                    }
953                })
954                .collect::<Vec<_>>(),
955        );
956
957        let s = raft.clone();
958        for _ in 0..600 {
959            let mut v = Vec::with_capacity(20_000);
960            unsafe { super::_clear_cache() };
961            for s in s.iter().cycle().take(20_000) {
962                v.push(u(s));
963            }
964        }
965    }
966
967    // This test is to have miri check the allocation code paths, but miri
968    // can't open files so it's not usable right now
969    // #[test]
970    // fn words() {
971    //     let _t = TEST_LOCK.lock();
972    //     use super::ustr as u;
973    //     use std::sync::Arc;
974
975    //     let path = std::path::Path::new("/usr/share/dict/words");
976    //     let wordlist = std::fs::read_to_string(path).unwrap();
977    //     let wordlist = Arc::new(
978    //         wordlist
979    //             .split_whitespace()
980    //             .collect::<Vec<_>>()
981    //             .chunks(7)
982    //             .cycle()
983    //             .take(4_000_000)
984    //             .enumerate()
985    //             .map(|(i, s)| u(&format!("{}{}", i, s.join("-"))))
986    //             .collect::<Vec<_>>(),
987    //     );
988    // }
989
990    #[cfg(all(feature = "serde", not(miri)))]
991    #[test]
992    fn serialization() {
993        let _t = TEST_LOCK.lock();
994        use super::{string_cache_iter, ustr as u};
995        use std::collections::HashSet;
996
997        // clear the cache first or our results will be wrong
998        unsafe { super::_clear_cache() };
999
1000        let path = std::path::Path::new(
1001            &std::env::var("CARGO_MANIFEST_DIR")
1002                .expect("CARGO_MANIFEST_DIR not set"),
1003        )
1004        .join("data")
1005        .join("blns.txt");
1006        let blns = std::fs::read_to_string(path).unwrap();
1007
1008        let mut hs = HashSet::new();
1009        for s in blns.split_whitespace() {
1010            hs.insert(s);
1011        }
1012
1013        let mut us = Vec::new();
1014        let mut ss = Vec::new();
1015
1016        for s in blns.split_whitespace().cycle().take(100_000) {
1017            let u = u(s);
1018            us.push(u);
1019            ss.push(s.to_owned());
1020        }
1021
1022        let json = serde_json::to_string(super::cache()).unwrap();
1023        unsafe {
1024            super::_clear_cache();
1025        }
1026        let _: super::DeserializedCache = serde_json::from_str(&json).unwrap();
1027
1028        // now check that we've got the same data in the cache still
1029        let mut hs_u = HashSet::new();
1030        for s in string_cache_iter() {
1031            hs_u.insert(s);
1032        }
1033        let diff: HashSet<_> = hs.difference(&hs_u).collect();
1034
1035        // check that the number of entries is the same
1036        assert_eq!(super::num_entries(), hs.len());
1037
1038        // check that we have the exact same (unique) strings in the cache as in
1039        // the source data
1040        assert_eq!(diff.len(), 0);
1041    }
1042
1043    #[cfg(all(feature = "serde", not(miri)))]
1044    #[test]
1045    fn serialization_ustr() {
1046        let _t = TEST_LOCK.lock();
1047
1048        use super::{ustr, Ustr};
1049
1050        let u_hello = ustr("hello");
1051
1052        let json = serde_json::to_string(&u_hello).unwrap();
1053        let me_hello: Ustr = serde_json::from_str(&json).unwrap();
1054
1055        assert_eq!(u_hello, me_hello);
1056    }
1057
1058    #[test]
1059    fn partial_ord() {
1060        let _t = TEST_LOCK.lock();
1061        use super::ustr;
1062        let str_a = ustr("aaa");
1063        let str_z = ustr("zzz");
1064        let str_k = ustr("kkk");
1065        assert!(str_a < str_k);
1066        assert!(str_k < str_z);
1067    }
1068
1069    #[test]
1070    fn ord() {
1071        let _t = TEST_LOCK.lock();
1072        use super::ustr;
1073        let u_apple = ustr("apple");
1074        let u_bravo = ustr("bravo");
1075        let u_charlie = ustr("charlie");
1076        let u_delta = ustr("delta");
1077
1078        let mut v = vec![u_delta, u_bravo, u_charlie, u_apple];
1079        v.sort();
1080        assert_eq!(v, vec![u_apple, u_bravo, u_charlie, u_delta]);
1081    }
1082
1083    fn takes_into_str<'a, S: Into<&'a str>>(s: S) -> &'a str {
1084        s.into()
1085    }
1086
1087    #[test]
1088    fn test_into_str() {
1089        let _t = TEST_LOCK.lock();
1090        use super::ustr;
1091
1092        assert_eq!("converted", takes_into_str(ustr("converted")));
1093    }
1094
1095    #[test]
1096    fn test_existing_ustr() {
1097        let _t = TEST_LOCK.lock();
1098        use super::{existing_ustr, ustr};
1099        assert_eq!(existing_ustr("hello world!"), None);
1100        let s1 = ustr("hello world!");
1101        let s2 = existing_ustr("hello world!");
1102        assert_eq!(Some(s1), s2);
1103    }
1104
1105    #[test]
1106    fn test_empty_cache() {
1107        unsafe { super::_clear_cache() };
1108        assert_eq!(
1109            super::string_cache_iter().collect::<Vec<_>>(),
1110            Vec::<&'static str>::new()
1111        );
1112    }
1113
1114    #[test]
1115    fn as_refs() {
1116        let _t = TEST_LOCK.lock();
1117
1118        let u = super::ustr("test");
1119
1120        let s: String = u.to_owned();
1121        assert_eq!(u, s);
1122        assert_eq!(s, u);
1123
1124        let p: &Path = u.as_ref();
1125        assert_eq!(p, u);
1126
1127        let _: &[u8] = u.as_ref();
1128
1129        let o: &OsStr = u.as_ref();
1130        assert_eq!(p, o);
1131        assert_eq!(o, p);
1132
1133        let cow = std::borrow::Cow::from(u);
1134        assert_eq!(cow, u);
1135        assert_eq!(u, cow);
1136
1137        let boxed: Box<str> = u.into();
1138        assert_eq!(boxed, u);
1139    }
1140}
1141
1142lazy_static::lazy_static! {
1143    static ref STRING_CACHE: Bins = {
1144        use std::mem::{self, MaybeUninit};
1145        // This deeply unsafe feeling dance allows us to initialize an array of
1146        // arbitrary size and will have to tide us over until const generics
1147        // land. See:
1148        // https://doc.rust-lang.org/beta/std/mem/union.MaybeUninit.html#initializing-an-array-element-by-element
1149
1150        // Create an uninitialized array of `MaybeUninit`. The `assume_init` is
1151        // safe because the type we are claiming to have initialized here is a
1152        // bunch of `MaybeUninit`s, which do not require initialization.
1153        let mut bins: [MaybeUninit<Mutex<StringCache>>; NUM_BINS] = unsafe {
1154            MaybeUninit::uninit().assume_init()
1155        };
1156
1157        // Dropping a `MaybeUninit` does nothing. Thus using raw pointer
1158        // assignment instead of `ptr::write` does not cause the old
1159        // uninitialized value to be dropped. Also if there is a panic during
1160        // this loop, we have a memory leak, but there is no memory safety
1161        // issue.
1162        for bin in &mut bins[..] {
1163            *bin = MaybeUninit::new(Mutex::new(StringCache::default()));
1164        }
1165
1166        // Everything is initialized. Transmute the array to the
1167        // initialized type.
1168        unsafe { mem::transmute::<_, Bins>(bins) }
1169    };
1170}
1171
1172// Use the top bits of the hash to choose a bin
1173#[inline]
1174fn whichbin(hash: u64) -> usize {
1175    ((hash >> TOP_SHIFT as u64) % NUM_BINS as u64) as usize
1176}