Skip to main content

cold_string/
lib.rs

1#![allow(rustdoc::bare_urls)]
2#![doc = include_str!("../README.md")]
3#![allow(unstable_name_collisions)]
4#![no_std]
5
6extern crate alloc;
7
8#[rustversion::before(1.84)]
9use sptr::Strict;
10
11use alloc::{
12    alloc::{alloc, dealloc, Layout},
13    borrow::{Cow, ToOwned},
14    boxed::Box,
15    str::Utf8Error,
16    string::String,
17};
18use core::{
19    cmp::Ordering,
20    fmt,
21    hash::{Hash, Hasher},
22    iter::FromIterator,
23    mem,
24    ops::Deref,
25    ptr, slice, str,
26};
27
28mod vint;
29use crate::vint::VarInt;
30
31#[cfg(feature = "rkyv")]
32mod rkyv;
33
/// Alignment requested for every heap buffer. A multiple-of-4 address has its two low bits
/// clear, which is what frees them up to carry the pointer tag.
const HEAP_ALIGN: usize = 4;
/// Size of a machine word in bytes; also the longest string that is stored inline.
const WIDTH: usize = mem::size_of::<usize>();
36
37/// Compact representation of immutable UTF-8 strings. Optimized for memory usage and struct packing.
38///
39/// # Example
40/// ```
41/// let s = cold_string::ColdString::new("qwerty");
42/// assert_eq!(s.as_str(), "qwerty");
43/// ```
44/// ```
45/// use core::mem;
46/// use cold_string::ColdString;
47///
48/// assert_eq!(mem::size_of::<ColdString>(), mem::size_of::<usize>());
49/// assert_eq!(mem::align_of::<ColdString>(), 1);
50/// assert_eq!(mem::size_of::<(ColdString, u8)>(), mem::size_of::<usize>() + 1);
51/// assert_eq!(mem::align_of::<(ColdString, u8)>(), 1);
52/// ```
// `packed` drops the alignment to 1 so the value can sit at any byte offset inside a
// containing struct. As a consequence the field must never be borrowed with `&`; it is
// only read through `addr_of!` / `read_unaligned`.
#[repr(packed)]
pub struct ColdString {
    /// One machine word that is either a tagged heap address or the string bytes themselves.
    ///
    /// The first byte of `encoded` (in memory order) is the "tag" and it determines the type:
    /// - `10xxxxxx`: an encoded address for the heap. To decode, `10` is set to `00` and the
    ///   word is rotated so that those two bits become the LSBs of the address again. This
    ///   works because the address is always a multiple of 4 (`HEAP_ALIGN`).
    /// - `11111xxx`: `xxx` is the length, always smaller than `WIDTH` (so `0..=7` on 64-bit
    ///   targets), followed by that many UTF-8 bytes.
    /// - anything else (the leading byte of valid UTF-8): exactly `WIDTH` UTF-8 bytes
    ///   (8 on 64-bit targets), with no separate tag byte.
    encoded: *const u8,
}
62
63impl ColdString {
    // Each mask below is built as `from_ne_bytes(x.to_le_bytes())`: the little-endian byte
    // array has `x` in position 0, so reinterpreting it natively places `x` in the first
    // in-memory byte of the word (the tag byte) regardless of target endianness.

    /// Selects the two most significant bits of the tag byte.
    const TAG_MASK: usize = usize::from_ne_bytes(0b11000000usize.to_le_bytes());
    /// Tag-byte pattern `11111xxx`: inline string whose length is stored in the tag byte.
    const INLINE_TAG: usize = usize::from_ne_bytes(0b11111000usize.to_le_bytes());
    /// Tag-byte pattern `10xxxxxx` (after masking with `TAG_MASK`): heap string.
    const PTR_TAG: usize = usize::from_ne_bytes(0b10000000usize.to_le_bytes());
    /// Selects the three length bits of an `11111xxx` tag byte.
    const LEN_MASK: usize = usize::from_ne_bytes(0b111usize.to_le_bytes());
    /// Rotation that moves a value between the low byte of a `usize` and the tag byte:
    /// nothing to do on little-endian, `WIDTH - 1` whole bytes on big-endian.
    const ROT: u32 = if cfg!(target_endian = "little") {
        0
    } else {
        8 * (WIDTH - 1) as u32
    };
73
74    /// Convert a slice of bytes into a [`ColdString`].
75    ///
76    /// A [`ColdString`] is a contiguous collection of bytes (`u8`s) that is valid [`UTF-8`](https://en.wikipedia.org/wiki/UTF-8).
77    /// This method converts from an arbitrary contiguous collection of bytes into a
78    /// [`ColdString`], failing if the provided bytes are not `UTF-8`.
79    ///
80    /// # Examples
81    /// ### Valid UTF-8
82    /// ```
83    /// # use cold_string::ColdString;
84    /// let bytes = [240, 159, 166, 128, 240, 159, 146, 175];
85    /// let compact = ColdString::from_utf8(&bytes).expect("valid UTF-8");
86    ///
87    /// assert_eq!(compact, "🦀💯");
88    /// ```
89    ///
90    /// ### Invalid UTF-8
91    /// ```
92    /// # use cold_string::ColdString;
93    /// let bytes = [255, 255, 255];
94    /// let result = ColdString::from_utf8(&bytes);
95    ///
96    /// assert!(result.is_err());
97    /// ```
98    pub fn from_utf8<B: AsRef<[u8]>>(v: B) -> Result<Self, Utf8Error> {
99        Ok(Self::new(str::from_utf8(v.as_ref())?))
100    }
101
    /// Converts a sequence of bytes to a [`ColdString`] without checking that the bytes are
    /// valid UTF-8.
    ///
    /// See the safe version, [`ColdString::from_utf8`], for more details.
    ///
    /// # Safety
    ///
    /// The bytes passed in must be valid UTF-8. They are handed to
    /// [`core::str::from_utf8_unchecked`] without validation, and [`ColdString::as_str`]
    /// later exposes the stored bytes as a `&str` without re-checking them.
    ///
    /// # Examples
    ///
    /// Basic usage:
    ///
    /// ```
    /// # use cold_string::ColdString;
    /// // some bytes, in an array
    /// let sparkle_heart = [240, 159, 146, 150];
    ///
    /// let sparkle_heart = unsafe {
    ///     ColdString::from_utf8_unchecked(&sparkle_heart)
    /// };
    ///
    /// assert_eq!("💖", sparkle_heart);
    /// ```
    pub unsafe fn from_utf8_unchecked<B: AsRef<[u8]>>(v: B) -> Self {
        Self::new(str::from_utf8_unchecked(v.as_ref()))
    }
125
126    /// Creates a new [`ColdString`] from any type that implements `AsRef<str>`.
127    /// If the string is shorter than `core::mem::size_of::<usize>()`, then it
128    /// will be inlined on the stack.
129    pub fn new<T: AsRef<str>>(x: T) -> Self {
130        let s = x.as_ref();
131        if s.len() <= WIDTH {
132            Self::new_inline(s)
133        } else {
134            Self::new_heap(s)
135        }
136    }
137
138    #[inline]
139    const fn inline_buf(s: &str) -> [u8; WIDTH] {
140        debug_assert!(s.len() <= WIDTH);
141        let mut buf = [0u8; WIDTH];
142        if s.len() < WIDTH {
143            let tag =
144                (Self::INLINE_TAG | s.len().rotate_left(Self::ROT)).rotate_right(Self::ROT) as u8;
145            buf[0] = tag;
146        }
147        buf
148    }
149
150    #[rustversion::attr(since(1.61), const)]
151    #[inline]
152    fn from_inline_buf(b: [u8; WIDTH]) -> Self {
153        let encoded = ptr::null_mut::<u8>().wrapping_add(usize::from_ne_bytes(b));
154        Self { encoded }
155    }
156
157    #[inline]
158    const fn utf8_start(l: usize) -> usize {
159        (l < WIDTH) as usize
160    }
161
162    #[inline]
163    fn new_inline(s: &str) -> Self {
164        let mut buf = Self::inline_buf(s);
165        let start = Self::utf8_start(s.len());
166        buf[start..s.len() + start].copy_from_slice(s.as_bytes());
167        Self::from_inline_buf(buf)
168    }
169
170    /// Creates a new inline [`ColdString`] from `&'static str` at compile time.
171    ///
172    /// In a dynamic context you can use the method [`ColdString::new()`].
173    ///
174    /// # Panics
175    /// The string must be less than `core::mem::size_of::<usize>()`. Creating
176    /// a [`ColdString`] larger than that is not supported.
177    ///
178    ///
179    /// # Examples
180    /// ```
181    /// use cold_string::ColdString;
182    ///
183    /// const DEFAULT_NAME: ColdString = ColdString::new_inline_const("cold");
184    /// ```
185    #[rustversion::since(1.61)]
186    #[inline]
187    pub const fn new_inline_const(s: &str) -> Self {
188        if s.len() > WIDTH {
189            panic!(
190                "Length for `new_inline_const` must be less than `core::mem::size_of::<usize>()`."
191            );
192        }
193        let mut buf = Self::inline_buf(s);
194        let start = Self::utf8_start(s.len());
195        let mut i = 0;
196        while i < s.len() {
197            buf[i + start] = s.as_bytes()[i];
198            i += 1;
199        }
200        Self::from_inline_buf(buf)
201    }
202
    /// Copies the raw encoded word out of the packed field.
    ///
    /// `addr_of!` plus `read_unaligned` is used instead of taking a reference because the
    /// struct is `repr(packed)`, so the field may sit at any byte offset.
    #[rustversion::attr(since(1.71), const)]
    #[inline]
    unsafe fn ptr(&self) -> *const u8 {
        ptr::read_unaligned(ptr::addr_of!(self.encoded))
    }
208
    /// Returns the encoded word as an integer, for inspecting the tag and length bits.
    #[inline]
    fn addr(&self) -> usize {
        // SAFETY: `ptr` only performs an unaligned read of `self.encoded` through `&self`.
        unsafe { self.ptr().addr() }
    }
213
214    #[inline]
215    fn tag(&self) -> usize {
216        self.addr() & Self::TAG_MASK
217    }
218
219    /// Returns `true` if the string bytes are inlined.
220    #[inline]
221    pub fn is_inline(&self) -> bool {
222        self.tag() != Self::PTR_TAG
223    }
224
    /// Copies `s` into a fresh heap buffer laid out as `[length header][string bytes]` and
    /// returns a `ColdString` holding the tagged address of that buffer.
    ///
    /// The callers in this file (`new` and `Clone`) only use this for strings longer than
    /// `WIDTH`.
    #[inline]
    fn new_heap(s: &str) -> Self {
        let len = s.len();
        // `VarInt::write` yields the header size in bytes and a buffer holding the header
        // bytes (this is how the two values are used by the copies below).
        let (vint_len, len_buf) = VarInt::write(len as u64);
        let total = vint_len + len;
        let layout = Layout::from_size_align(total, HEAP_ALIGN).unwrap();

        unsafe {
            let ptr = alloc(layout);
            if ptr.is_null() {
                alloc::alloc::handle_alloc_error(layout);
            }

            // TODO: can optimize this
            // Header first, string bytes immediately after it.
            ptr::copy_nonoverlapping(len_buf.as_ptr(), ptr, vint_len);
            ptr::copy_nonoverlapping(s.as_ptr(), ptr.add(vint_len), len);
            let encoded = ptr.map_addr(|addr| {
                debug_assert!(addr % HEAP_ALIGN == 0);
                // Rotate the two always-zero low address bits into the top two bits of the
                // tag byte, then set the upper one so the tag reads `10`.
                let mut addr = addr.rotate_left(6 + Self::ROT);
                addr |= Self::PTR_TAG;
                addr
            });
            Self { encoded }
        }
    }
250
    /// Decodes the tagged word back into the address of the heap buffer.
    ///
    /// Exact inverse of the encoding done in `new_heap`; only meaningful for heap strings
    /// (checked with a `debug_assert!`).
    #[inline]
    fn heap_ptr(&self) -> *const u8 {
        debug_assert!(!self.is_inline());
        unsafe {
            self.ptr().map_addr(|mut addr| {
                // Clear the tag bit (`10` -> `00`), then rotate the two zero bits back down
                // to the bottom of the address.
                addr ^= Self::PTR_TAG;
                let addr = addr.rotate_right(6 + Self::ROT);
                debug_assert!(addr % HEAP_ALIGN == 0);
                addr
            })
        }
    }
263
264    #[inline]
265    fn inline_len(&self) -> usize {
266        let addr = self.addr();
267        match addr & Self::INLINE_TAG {
268            Self::INLINE_TAG => (addr & Self::LEN_MASK).rotate_right(Self::ROT),
269            _ => WIDTH,
270        }
271    }
272
273    /// Returns the length of this `ColdString`, in bytes, not [`char`]s or
274    /// graphemes. In other words, it might not be what a human considers the
275    /// length of the string.
276    ///
277    /// # Examples
278    ///
279    /// ```
280    /// use cold_string::ColdString;
281    ///
282    /// let a = ColdString::from("foo");
283    /// assert_eq!(a.len(), 3);
284    ///
285    /// let fancy_f = String::from("ƒoo");
286    /// assert_eq!(fancy_f.len(), 4);
287    /// assert_eq!(fancy_f.chars().count(), 3);
288    /// ```
289    #[inline]
290    pub fn len(&self) -> usize {
291        if self.is_inline() {
292            self.inline_len()
293        } else {
294            unsafe {
295                let ptr = self.heap_ptr();
296                let (len, _) = VarInt::read(ptr);
297                len as usize
298            }
299        }
300    }
301
    /// Borrows the inline string bytes directly out of `self`.
    ///
    /// # Safety
    /// NOTE(review): presumably must only be called when `is_inline()` is true. The callers
    /// in this file check that first; confirm for any new caller.
    #[allow(unsafe_op_in_unsafe_fn)]
    #[inline]
    unsafe fn decode_inline(&self) -> &[u8] {
        let len = self.inline_len();
        // SAFETY: addr_of! avoids &self.ptr (which is UB due to alignment)
        let self_bytes_ptr = ptr::addr_of!(self.encoded) as *const u8;
        // Skip the tag byte when there is one.
        let start = Self::utf8_start(len);
        slice::from_raw_parts(self_bytes_ptr.add(start), len)
    }
311
    /// Borrows the string bytes stored in the heap buffer.
    ///
    /// # Safety
    /// NOTE(review): presumably must only be called when `is_inline()` is false, so that
    /// `heap_ptr` decodes a real allocation. The callers in this file check that first.
    #[allow(unsafe_op_in_unsafe_fn)]
    #[inline]
    unsafe fn decode_heap(&self) -> &[u8] {
        let ptr = self.heap_ptr();
        // The header gives the string length and its own size; the data follows it.
        let (len, header) = VarInt::read(ptr);
        let data = ptr.add(header);
        slice::from_raw_parts(data, len)
    }
320
321    /// Returns a byte slice of this `ColdString`'s contents.
322    ///
323    /// The inverse of this method is [`from_utf8`].
324    ///
325    /// [`from_utf8`]: String::from_utf8
326    ///
327    /// # Examples
328    ///
329    /// ```
330    /// let s = cold_string::ColdString::from("hello");
331    ///
332    /// assert_eq!(&[104, 101, 108, 108, 111], s.as_bytes());
333    /// ```
334    #[inline]
335    pub fn as_bytes(&self) -> &[u8] {
336        match self.is_inline() {
337            true => unsafe { self.decode_inline() },
338            false => unsafe { self.decode_heap() },
339        }
340    }
341
342    /// Returns a string slice containing the entire [`ColdString`].
343    ///
344    /// # Examples
345    /// ```
346    /// let s = cold_string::ColdString::new("hello");
347    ///
348    /// assert_eq!(s.as_str(), "hello");
349    /// ```
350    #[inline]
351    pub fn as_str(&self) -> &str {
352        unsafe { str::from_utf8_unchecked(self.as_bytes()) }
353    }
354
355    /// Returns `true` if this `ColdString` has a length of zero, and `false` otherwise.
356    ///
357    /// # Examples
358    ///
359    /// ```
360    /// let v = cold_string::ColdString::new("");
361    /// assert!(v.is_empty());
362    /// ```
363    #[inline]
364    pub fn is_empty(&self) -> bool {
365        self.len() == 0
366    }
367}
368
369impl Default for ColdString {
370    fn default() -> Self {
371        Self::new_inline("")
372    }
373}
374
375impl Deref for ColdString {
376    type Target = str;
377    fn deref(&self) -> &str {
378        self.as_str()
379    }
380}
381
382impl Drop for ColdString {
383    fn drop(&mut self) {
384        if !self.is_inline() {
385            unsafe {
386                let ptr = self.heap_ptr();
387                let (len, header) = VarInt::read(ptr);
388                let total = header + len;
389                let layout = Layout::from_size_align(total, HEAP_ALIGN).unwrap();
390                dealloc(ptr as *mut u8, layout);
391            }
392        }
393    }
394}
395
396impl Clone for ColdString {
397    fn clone(&self) -> Self {
398        match self.is_inline() {
399            true => unsafe {
400                Self {
401                    encoded: self.ptr(),
402                }
403            },
404            false => Self::new_heap(self.as_str()),
405        }
406    }
407}
408
409impl PartialEq for ColdString {
410    fn eq(&self, other: &Self) -> bool {
411        match (self.is_inline(), other.is_inline()) {
412            (true, true) => unsafe { self.ptr() == other.ptr() },
413            (false, false) => unsafe { self.decode_heap() == other.decode_heap() },
414            _ => false,
415        }
416    }
417}
418
419impl Eq for ColdString {}
420
421impl Hash for ColdString {
422    fn hash<H: Hasher>(&self, state: &mut H) {
423        self.as_str().hash(state)
424    }
425}
426
427impl fmt::Debug for ColdString {
428    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
429        fmt::Debug::fmt(self.as_str(), f)
430    }
431}
432
433impl fmt::Display for ColdString {
434    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
435        fmt::Display::fmt(self.as_str(), f)
436    }
437}
438
439impl From<&str> for ColdString {
440    fn from(s: &str) -> Self {
441        Self::new(s)
442    }
443}
444
445impl From<String> for ColdString {
446    fn from(s: String) -> Self {
447        Self::new(&s)
448    }
449}
450
451impl From<ColdString> for String {
452    fn from(s: ColdString) -> Self {
453        s.as_str().to_owned()
454    }
455}
456
457impl From<ColdString> for Cow<'_, str> {
458    #[inline]
459    fn from(s: ColdString) -> Self {
460        Self::Owned(s.into())
461    }
462}
463
464impl<'a> From<&'a ColdString> for Cow<'a, str> {
465    #[inline]
466    fn from(s: &'a ColdString) -> Self {
467        Self::Borrowed(s)
468    }
469}
470
471impl<'a> From<Cow<'a, str>> for ColdString {
472    fn from(cow: Cow<'a, str>) -> Self {
473        match cow {
474            Cow::Borrowed(s) => s.into(),
475            Cow::Owned(s) => s.into(),
476        }
477    }
478}
479
480impl From<Box<str>> for ColdString {
481    #[inline]
482    #[track_caller]
483    fn from(b: Box<str>) -> Self {
484        Self::new(&b)
485    }
486}
487
488impl FromIterator<char> for ColdString {
489    fn from_iter<I: IntoIterator<Item = char>>(iter: I) -> Self {
490        let s: String = iter.into_iter().collect();
491        ColdString::new(&s)
492    }
493}
494
// SAFETY: the raw pointer is either inline data or the address of a heap buffer that this
// value alone frees in `Drop` and deep-copies in `Clone`. No method in this file writes to
// the buffer through `&self`, so moving or sharing a `ColdString` across threads cannot race.
unsafe impl Send for ColdString {}
unsafe impl Sync for ColdString {}
497
498impl core::borrow::Borrow<str> for ColdString {
499    fn borrow(&self) -> &str {
500        self.as_str()
501    }
502}
503
504impl PartialEq<str> for ColdString {
505    fn eq(&self, other: &str) -> bool {
506        if self.is_inline() {
507            unsafe { self.decode_inline() == other.as_bytes() }
508        } else {
509            unsafe { self.decode_heap() == other.as_bytes() }
510        }
511    }
512}
513
514impl PartialEq<ColdString> for str {
515    fn eq(&self, other: &ColdString) -> bool {
516        other.eq(self)
517    }
518}
519
520impl PartialEq<&str> for ColdString {
521    fn eq(&self, other: &&str) -> bool {
522        self.eq(*other)
523    }
524}
525
526impl PartialEq<ColdString> for &str {
527    fn eq(&self, other: &ColdString) -> bool {
528        other.eq(*self)
529    }
530}
531
532impl AsRef<str> for ColdString {
533    #[inline]
534    fn as_ref(&self) -> &str {
535        self.as_str()
536    }
537}
538
539impl AsRef<[u8]> for ColdString {
540    #[inline]
541    fn as_ref(&self) -> &[u8] {
542        self.as_bytes()
543    }
544}
545
546impl Ord for ColdString {
547    fn cmp(&self, other: &Self) -> Ordering {
548        self.as_str().cmp(other.as_str())
549    }
550}
551
552impl PartialOrd for ColdString {
553    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
554        self.as_str().partial_cmp(other.as_str())
555    }
556}
557
558impl alloc::str::FromStr for ColdString {
559    type Err = core::convert::Infallible;
560    fn from_str(s: &str) -> Result<ColdString, Self::Err> {
561        Ok(ColdString::new(s))
562    }
563}
564
#[cfg(feature = "serde")]
impl serde::Serialize for ColdString {
    /// Serializes as a plain string, so the inline/heap split never shows in the output.
    fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
        let text = self.as_str();
        serializer.serialize_str(text)
    }
}
571
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for ColdString {
    /// Deserializes through an intermediate `String`, then copies it into a `ColdString`.
    fn deserialize<D: serde::Deserializer<'de>>(d: D) -> Result<Self, D::Error> {
        let owned = <String as serde::Deserialize<'de>>::deserialize(d)?;
        Ok(Self::new(owned.as_str()))
    }
}
579
#[cfg(all(test, feature = "serde"))]
mod serde_tests {
    use super::*;
    use serde_test::{assert_tokens, Token};

    /// Round-trips a string short enough to be stored inline.
    #[test]
    fn test_serde_cold_string_inline() {
        let short = ColdString::new("ferris");
        let expected = [Token::Str("ferris")];
        assert_tokens(&short, &expected);
    }

    /// Round-trips a string long enough to be stored on the heap.
    #[test]
    fn test_serde_cold_string_heap() {
        let long_str = "This is a significantly longer string for heap testing";
        let expected = [Token::Str(long_str)];
        assert_tokens(&ColdString::new(long_str), &expected);
    }
}
598
599#[cfg(test)]
600mod tests {
601    use super::*;
602    use core::hash::BuildHasher;
603    use hashbrown::hash_map::DefaultHashBuilder;
604
605    #[test]
606    fn test_layout() {
607        assert_eq!(mem::size_of::<ColdString>(), mem::size_of::<usize>());
608        assert_eq!(mem::align_of::<ColdString>(), 1);
609        struct Foo {
610            _s: ColdString,
611            _b: u8,
612        }
613
614        assert_eq!(mem::size_of::<Foo>(), mem::size_of::<usize>() + 1);
615        assert_eq!(mem::align_of::<Foo>(), 1);
616    }
617
618    #[test]
619    fn test_default() {
620        assert!(ColdString::default().is_empty());
621        assert_eq!(ColdString::default().len(), 0);
622        assert_eq!(ColdString::default(), "");
623        assert_eq!(ColdString::default(), ColdString::new(""));
624    }
625
626    fn assert_correct(s: &str) {
627        let cs = ColdString::new(s);
628        assert_eq!(s.len() <= mem::size_of::<usize>(), cs.is_inline());
629        assert_eq!(cs.len(), s.len());
630        assert_eq!(cs.as_bytes(), s.as_bytes());
631        assert_eq!(cs.as_str(), s);
632        assert_eq!(cs.clone(), cs);
633        let bh = DefaultHashBuilder::new();
634        let mut hasher1 = bh.build_hasher();
635        cs.hash(&mut hasher1);
636        let mut hasher2 = bh.build_hasher();
637        cs.clone().hash(&mut hasher2);
638        assert_eq!(hasher1.finish(), hasher2.finish());
639        assert_eq!(cs, s);
640        assert_eq!(s, cs);
641        assert_eq!(cs, *s);
642        assert_eq!(*s, cs);
643    }
644
645    #[test]
646    fn it_works() {
647        for s in [
648            "1",
649            "12",
650            "123",
651            "1234",
652            "12345",
653            "123456",
654            "1234567",
655            "12345678",
656            "123456789",
657            str::from_utf8(&[240, 159, 146, 150]).unwrap(),
658            "✅",
659            "❤️",
660            "🦀💯",
661            "🦀",
662            "💯",
663            "abcd",
664            "test",
665            "",
666            "\0",
667            "\0\0",
668            "\0\0\0",
669            "\0\0\0\0",
670            "\0\0\0\0\0\0\0",
671            "\0\0\0\0\0\0\0\0",
672            "1234567",
673            "12345678",
674            "longer test",
675            str::from_utf8(&[103, 39, 240, 145, 167, 156, 194, 165]).unwrap(),
676            "AaAa0 ® ",
677            str::from_utf8(&[240, 158, 186, 128, 240, 145, 143, 151]).unwrap(),
678        ] {
679            assert_correct(s);
680        }
681    }
682
683    fn char_from_leading_byte(b: u8) -> Option<char> {
684        match b {
685            0x00..=0x7F => Some(b as char),
686            0xC2..=0xDF => str::from_utf8(&[b, 0x91]).unwrap().chars().next(),
687            0xE0 => str::from_utf8(&[b, 0xA0, 0x91]).unwrap().chars().next(),
688            0xE1..=0xEC | 0xEE..=0xEF => str::from_utf8(&[b, 0x91, 0xA5]).unwrap().chars().next(),
689            0xED => str::from_utf8(&[b, 0x80, 0x91]).unwrap().chars().next(),
690            0xF0 => str::from_utf8(&[b, 0x90, 0x91, 0xA5])
691                .unwrap()
692                .chars()
693                .next(),
694            0xF1..=0xF3 => str::from_utf8(&[b, 0x91, 0xA5, 0x82])
695                .unwrap()
696                .chars()
697                .next(),
698            0xF4 => str::from_utf8(&[b, 0x80, 0x91, 0x82])
699                .unwrap()
700                .chars()
701                .next(),
702            _ => None,
703        }
704    }
705
706    #[test]
707    fn test_edges() {
708        let width = mem::size_of::<usize>();
709        for len in [width - 1, width, width + 1] {
710            for first_byte in 0u8..=255 {
711                let first_char = match char_from_leading_byte(first_byte) {
712                    Some(c) => c,
713                    None => continue,
714                };
715
716                let mut s = String::with_capacity(len);
717                s.push(first_char);
718
719                while s.len() < len {
720                    let c = core::char::from_digit((len - s.len()) as u32, 10).unwrap();
721                    s.push(c);
722                }
723
724                assert_correct(&s);
725            }
726        }
727    }
728
729    #[test]
730    fn test_unaligned_placement() {
731        for s_content in ["torture", "tor", "tortures", "tort", "torture torture"] {
732            let mut buffer = [0u8; 32];
733            for offset in 0..8 {
734                unsafe {
735                    let dst = buffer.as_mut_ptr().add(offset) as *mut ColdString;
736                    let s = ColdString::new(s_content);
737                    ptr::write_unaligned(dst, s);
738                    let recovered = ptr::read_unaligned(dst);
739                    assert_eq!(recovered.as_str(), s_content);
740                }
741            }
742        }
743    }
744}