ijson/
string.rs

1//! Functionality relating to the JSON string type
2
3use std::alloc::{alloc, dealloc, Layout, LayoutError};
4use std::borrow::Borrow;
5use std::cmp::Ordering;
6use std::fmt::{self, Debug, Formatter};
7use std::hash::Hash;
8use std::ops::Deref;
9use std::ptr::{copy_nonoverlapping, NonNull};
10use std::sync::atomic::{AtomicUsize, Ordering as AtomicOrdering};
11
12use dashmap::{DashSet, SharedValue};
13use lazy_static::lazy_static;
14
15use crate::thin::{ThinMut, ThinMutExt, ThinRef, ThinRefExt};
16
17use super::value::{IValue, TypeTag};
18
19#[repr(C)]
20#[repr(align(4))]
21struct Header {
22    rc: AtomicUsize,
23    // We use 48 bits for the length and 16 bits for the shard index.
24    len_lower: u32,
25    len_upper: u16,
26    shard_index: u16,
27}
28
29trait HeaderRef<'a>: ThinRefExt<'a, Header> {
30    fn len(&self) -> usize {
31        (u64::from(self.len_lower) | (u64::from(self.len_upper) << 32)) as usize
32    }
33    fn shard_index(&self) -> usize {
34        self.shard_index as usize
35    }
36    fn str_ptr(&self) -> *const u8 {
37        // Safety: pointers to the end of structs are allowed
38        unsafe { self.ptr().add(1).cast() }
39    }
40    fn bytes(&self) -> &'a [u8] {
41        // Safety: Header `len` must be accurate
42        unsafe { std::slice::from_raw_parts(self.str_ptr(), self.len()) }
43    }
44    fn str(&self) -> &'a str {
45        // Safety: UTF-8 enforced on construction
46        unsafe { std::str::from_utf8_unchecked(self.bytes()) }
47    }
48}
49
50trait HeaderMut<'a>: ThinMutExt<'a, Header> {
51    fn str_ptr_mut(mut self) -> *mut u8 {
52        // Safety: pointers to the end of structs are allowed
53        unsafe { self.ptr_mut().add(1).cast() }
54    }
55}
56
57impl<'a, T: ThinRefExt<'a, Header>> HeaderRef<'a> for T {}
58impl<'a, T: ThinMutExt<'a, Header>> HeaderMut<'a> for T {}
59
60lazy_static! {
61    static ref STRING_CACHE: DashSet<WeakIString> = DashSet::new();
62}
63
64// Eagerly initialize the string cache during tests or when the
65// `ctor` feature is enabled.
66#[cfg(any(test, feature = "ctor"))]
67#[ctor::ctor]
68fn ctor_init_cache() {
69    lazy_static::initialize(&STRING_CACHE);
70}
71
72#[doc(hidden)]
73pub fn init_cache() {
74    lazy_static::initialize(&STRING_CACHE);
75}
76
77struct WeakIString {
78    ptr: NonNull<Header>,
79}
80
81unsafe impl Send for WeakIString {}
82unsafe impl Sync for WeakIString {}
83impl PartialEq for WeakIString {
84    fn eq(&self, other: &Self) -> bool {
85        **self == **other
86    }
87}
88impl Eq for WeakIString {}
89impl Hash for WeakIString {
90    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
91        (**self).hash(state);
92    }
93}
94
95impl Deref for WeakIString {
96    type Target = str;
97    fn deref(&self) -> &str {
98        self.borrow()
99    }
100}
101
102impl Borrow<str> for WeakIString {
103    fn borrow(&self) -> &str {
104        self.header().str()
105    }
106}
107impl WeakIString {
108    fn header(&self) -> ThinRef<Header> {
109        // Safety: pointer is always valid
110        unsafe { ThinRef::new(self.ptr.as_ptr()) }
111    }
112    fn upgrade(&self) -> IString {
113        unsafe {
114            self.ptr.as_ref().rc.fetch_add(1, AtomicOrdering::Relaxed);
115            IString(IValue::new_ptr(
116                self.ptr.as_ptr().cast::<u8>(),
117                TypeTag::StringOrNull,
118            ))
119        }
120    }
121}
122
123/// The `IString` type is an interned, immutable string, and is where this crate
124/// gets its name.
125///
126/// Cloning an `IString` is cheap, and it can be easily converted from `&str` or
127/// `String` types. Comparisons between `IString`s is a simple pointer
128/// comparison.
129///
130/// The memory backing an `IString` is reference counted, so that unlike many
131/// string interning libraries, memory is not leaked as new strings are interned.
132/// Interning uses `DashSet`, an implementation of a concurrent hash-set, allowing
133/// many strings to be interned concurrently without becoming a bottleneck.
134///
135/// Given the nature of `IString` it is better to intern a string once and reuse
136/// it, rather than continually convert from `&str` to `IString`.
137#[repr(transparent)]
138#[derive(Clone)]
139pub struct IString(pub(crate) IValue);
140
141value_subtype_impls!(IString, into_string, as_string, as_string_mut);
142
143static EMPTY_HEADER: Header = Header {
144    len_lower: 0,
145    len_upper: 0,
146    shard_index: 0,
147    rc: AtomicUsize::new(0),
148};
149
150impl IString {
151    fn layout(len: usize) -> Result<Layout, LayoutError> {
152        Ok(Layout::new::<Header>()
153            .extend(Layout::array::<u8>(len)?)?
154            .0
155            .pad_to_align())
156    }
157
158    fn alloc(s: &str, shard_index: usize) -> *mut Header {
159        assert!((s.len() as u64) < (1 << 48));
160        assert!(shard_index < (1 << 16));
161        unsafe {
162            let ptr = alloc(Self::layout(s.len()).unwrap()).cast::<Header>();
163            ptr.write(Header {
164                len_lower: s.len() as u32,
165                len_upper: ((s.len() as u64) >> 32) as u16,
166                shard_index: shard_index as u16,
167                rc: AtomicUsize::new(0),
168            });
169            let hd = ThinMut::new(ptr);
170            copy_nonoverlapping(s.as_ptr(), hd.str_ptr_mut(), s.len());
171            ptr
172        }
173    }
174
175    fn dealloc(ptr: *mut Header) {
176        unsafe {
177            let hd = ThinRef::new(ptr);
178            let layout = Self::layout(hd.len()).unwrap();
179            dealloc(ptr.cast::<u8>(), layout);
180        }
181    }
182
183    /// Converts a `&str` to an `IString` by interning it in the global string cache.
184    #[must_use]
185    pub fn intern(s: &str) -> Self {
186        if s.is_empty() {
187            return Self::new();
188        }
189        let cache = &*STRING_CACHE;
190        let shard_index = cache.determine_map(s);
191
192        // Safety: `determine_map` should only return valid shard indices
193        let shard = unsafe { cache.shards().get_unchecked(shard_index) };
194        let mut guard = shard.write();
195        if let Some((k, _)) = guard.get_key_value(s) {
196            k.upgrade()
197        } else {
198            let k = unsafe {
199                WeakIString {
200                    ptr: NonNull::new_unchecked(Self::alloc(s, shard_index)),
201                }
202            };
203            let res = k.upgrade();
204            guard.insert(k, SharedValue::new(()));
205            res
206        }
207    }
208
209    fn header(&self) -> ThinRef<Header> {
210        unsafe { ThinRef::new(self.0.ptr().cast()) }
211    }
212
213    /// Returns the length (in bytes) of this string.
214    #[must_use]
215    pub fn len(&self) -> usize {
216        self.header().len()
217    }
218
219    /// Returns `true` if this is the empty string "".
220    #[must_use]
221    pub fn is_empty(&self) -> bool {
222        self.len() == 0
223    }
224
225    /// Obtains a `&str` from this `IString`. This is a cheap operation.
226    #[must_use]
227    pub fn as_str(&self) -> &str {
228        self.header().str()
229    }
230
231    /// Obtains a byte slice from this `IString`. This is a cheap operation.
232    #[must_use]
233    pub fn as_bytes(&self) -> &[u8] {
234        self.header().bytes()
235    }
236
237    /// Returns the empty string.
238    #[must_use]
239    pub fn new() -> Self {
240        unsafe { IString(IValue::new_ref(&EMPTY_HEADER, TypeTag::StringOrNull)) }
241    }
242
243    pub(crate) fn clone_impl(&self) -> IValue {
244        if self.is_empty() {
245            Self::new().0
246        } else {
247            self.header().rc.fetch_add(1, AtomicOrdering::Relaxed);
248            unsafe { self.0.raw_copy() }
249        }
250    }
251    pub(crate) fn drop_impl(&mut self) {
252        if !self.is_empty() {
253            let hd = self.header();
254
255            // If the reference count is greater than 1, we can safely decrement it without
256            // locking the string cache.
257            let mut rc = hd.rc.load(AtomicOrdering::Relaxed);
258            while rc > 1 {
259                match hd.rc.compare_exchange_weak(
260                    rc,
261                    rc - 1,
262                    AtomicOrdering::Relaxed,
263                    AtomicOrdering::Relaxed,
264                ) {
265                    Ok(_) => return,
266                    Err(new_rc) => rc = new_rc,
267                }
268            }
269
270            // Slow path: we observed a reference count of 1, so we need to lock the string cache
271            let cache = &*STRING_CACHE;
272            // Safety: the number of shards is fixed
273            let shard = unsafe { cache.shards().get_unchecked(hd.shard_index()) };
274            let mut guard = shard.write();
275            if hd.rc.fetch_sub(1, AtomicOrdering::Relaxed) == 1 {
276                // Reference count reached zero, free the string
277                assert!(guard.remove(hd.str()).is_some());
278
279                // Shrink the shard if it's mostly empty.
280                // The second condition is necessary because `HashMap` sometimes
281                // reports a capacity of zero even when it's still backed by an
282                // allocation.
283                if guard.len() * 3 < guard.capacity() || guard.is_empty() {
284                    guard.shrink_to_fit();
285                }
286                drop(guard);
287
288                Self::dealloc(unsafe { self.0.ptr().cast() });
289            }
290        }
291    }
292}
293
294impl Deref for IString {
295    type Target = str;
296    fn deref(&self) -> &str {
297        self.as_str()
298    }
299}
300
301impl Borrow<str> for IString {
302    fn borrow(&self) -> &str {
303        self.as_str()
304    }
305}
306
307impl From<&str> for IString {
308    fn from(other: &str) -> Self {
309        Self::intern(other)
310    }
311}
312
313impl From<&mut str> for IString {
314    fn from(other: &mut str) -> Self {
315        Self::intern(other)
316    }
317}
318
319impl From<String> for IString {
320    fn from(other: String) -> Self {
321        Self::intern(other.as_str())
322    }
323}
324
325impl From<&String> for IString {
326    fn from(other: &String) -> Self {
327        Self::intern(other.as_str())
328    }
329}
330
331impl From<&mut String> for IString {
332    fn from(other: &mut String) -> Self {
333        Self::intern(other.as_str())
334    }
335}
336
337impl From<IString> for String {
338    fn from(other: IString) -> Self {
339        other.as_str().into()
340    }
341}
342
343impl PartialEq for IString {
344    fn eq(&self, other: &Self) -> bool {
345        self.0.raw_eq(&other.0)
346    }
347}
348
349impl PartialEq<str> for IString {
350    fn eq(&self, other: &str) -> bool {
351        self.as_str() == other
352    }
353}
354
355impl PartialEq<IString> for str {
356    fn eq(&self, other: &IString) -> bool {
357        self == other.as_str()
358    }
359}
360
361impl PartialEq<String> for IString {
362    fn eq(&self, other: &String) -> bool {
363        self.as_str() == other
364    }
365}
366
367impl PartialEq<IString> for String {
368    fn eq(&self, other: &IString) -> bool {
369        self == other.as_str()
370    }
371}
372
373impl Default for IString {
374    fn default() -> Self {
375        Self::new()
376    }
377}
378
379impl Eq for IString {}
380impl Ord for IString {
381    fn cmp(&self, other: &Self) -> Ordering {
382        if self == other {
383            Ordering::Equal
384        } else {
385            self.as_str().cmp(other.as_str())
386        }
387    }
388}
389impl PartialOrd for IString {
390    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
391        Some(self.cmp(other))
392    }
393}
394impl Hash for IString {
395    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
396        self.0.raw_hash(state);
397    }
398}
399
400impl Debug for IString {
401    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
402        Debug::fmt(self.as_str(), f)
403    }
404}
405
406#[cfg(test)]
407mod tests {
408    use super::*;
409
410    #[mockalloc::test]
411    fn can_intern() {
412        let x = IString::intern("foo");
413        let y = IString::intern("bar");
414        let z = IString::intern("foo");
415
416        assert_eq!(x.as_ptr(), z.as_ptr());
417        assert_ne!(x.as_ptr(), y.as_ptr());
418        assert_eq!(x.as_str(), "foo");
419        assert_eq!(y.as_str(), "bar");
420    }
421
422    #[mockalloc::test]
423    fn default_interns_string() {
424        let x = IString::intern("");
425        let y = IString::new();
426        let z = IString::intern("foo");
427
428        assert_eq!(x.as_ptr(), y.as_ptr());
429        assert_ne!(x.as_ptr(), z.as_ptr());
430    }
431}