Skip to main content

utf8_bytes/
bytes.rs

1use crate::FromUtf8Error;
2
3use super::Utf8BytesMut;
4
5use core::iter::FromIterator;
6use core::ops::{Deref, RangeBounds};
7use core::{cmp, fmt, hash};
8use std::borrow::Cow;
9
10use alloc::{borrow::Borrow, boxed::Box, string::String, vec::Vec};
11
/// A cheaply cloneable and sliceable chunk of contiguous memory filled with
/// UTF-8 bytes.
///
/// This is built on [`Bytes`](bytes::Bytes), see its documentation for more.
// `repr(transparent)`: the wrapper has the same layout as its single field.
#[repr(transparent)]
pub struct Utf8Bytes {
    /// # Invariant
    /// - contains UTF-8.
    // Marked deprecated so that every direct field access has to opt in with
    // `#[expect(deprecated)]`; everything else goes through the accessors.
    #[deprecated = "use the accessors to preserve the invariants"]
    inner: bytes::Bytes,
}
23
24impl Utf8Bytes {
25    /// Wrap `bytes` if it is UTF-8.
26    ///
27    /// If it is not, you can perform a lossy conversion using [`FromUtf8Error::into_utf8_lossy`].
28    pub fn from_bytes(bytes: bytes::Bytes) -> Result<Self, FromUtf8Error<bytes::Bytes>> {
29        match str::from_utf8(&bytes) {
30            // SAFETY:
31            // - performed validation
32            Ok(_) => Ok(unsafe { Self::from_bytes_unchecked(bytes) }),
33            Err(error) => Err(FromUtf8Error { bytes, error }),
34        }
35    }
36
37    /// # Safety
38    /// `bytes` must only contain UTF-8.
39    pub const unsafe fn from_bytes_unchecked(bytes: bytes::Bytes) -> Self {
40        #[expect(deprecated)]
41        Self { inner: bytes }
42    }
43
44    /// Get the contents of the buffer.
45    pub fn as_str(&self) -> &str {
46        // SAFETY:
47        // - cannot create Self from invalid UTF-8 without using `unsafe`
48        unsafe { str::from_utf8_unchecked(self.inner()) }
49    }
50}
51
impl Utf8Bytes {
    /// Return a shared reference to the inner object.
    #[inline]
    pub const fn inner(&self) -> &bytes::Bytes {
        // Direct field access is gated behind the `deprecated` lint on purpose.
        #[expect(deprecated)]
        &self.inner
    }

    /// Return an exclusive reference to the inner object.
    ///
    /// # Safety
    /// - The inner bytes must still contain only UTF-8 once the caller is done
    ///   with the returned reference.
    #[inline]
    pub const unsafe fn inner_mut(&mut self) -> &mut bytes::Bytes {
        #[expect(deprecated)]
        &mut self.inner
    }

    /// Consume `self` and return the inner [`bytes::Bytes`].
    #[inline]
    pub fn into_inner(self) -> bytes::Bytes {
        #[expect(deprecated)]
        self.inner
    }
}
75
76impl Utf8Bytes {
    /// Creates a new empty [`Utf8Bytes`].
    ///
    /// This will not allocate and the returned handle will be empty.
    ///
    /// # Examples
    ///
    /// ```
    /// use utf8_bytes::Utf8Bytes;
    ///
    /// let b = Utf8Bytes::new();
    /// assert_eq!(b, "");
    /// ```
    #[inline]
    pub const fn new() -> Self {
        // SAFETY:
        // - empty is valid UTF-8
        unsafe { Self::from_bytes_unchecked(bytes::Bytes::new()) }
    }
95
    /// Creates a new [`Utf8Bytes`] from a static string slice.
    ///
    /// The returned [`Utf8Bytes`] will point directly to the static slice.
    /// There is no allocating or copying.
    ///
    /// # Examples
    ///
    /// ```
    /// use utf8_bytes::Utf8Bytes;
    ///
    /// let b = Utf8Bytes::from_static("hello");
    /// assert_eq!(b, "hello");
    /// ```
    #[inline]
    pub const fn from_static(str: &'static str) -> Self {
        // SAFETY:
        // - str: &str, so its bytes are UTF-8
        unsafe { Self::from_bytes_unchecked(bytes::Bytes::from_static(str.as_bytes())) }
    }
115
116    /// Create [`Utf8Bytes`] with a buffer whose lifetime is controlled
117    /// via an explicit owner.
118    ///
119    /// See [`bytes::Bytes::from_owner`] for more.
120    pub fn from_owner<T>(owner: T) -> Self
121    where
122        T: AsRef<str> + Send + 'static,
123    {
124        #[repr(transparent)]
125        struct AsBytes<T>(T);
126        impl<T: AsRef<str>> AsRef<[u8]> for AsBytes<T> {
127            fn as_ref(&self) -> &[u8] {
128                self.0.as_ref().as_bytes()
129            }
130        }
131        // SAFETY:
132        // - owner: AsRef<str>
133        unsafe { Self::from_bytes_unchecked(bytes::Bytes::from_owner(AsBytes(owner))) }
134    }
135
    /// Returns the number of bytes contained in this [`Utf8Bytes`].
    ///
    /// The count is in bytes, as reported by the inner buffer, not in `char`s.
    ///
    /// # Examples
    ///
    /// ```
    /// use utf8_bytes::Utf8Bytes;
    ///
    /// let b = Utf8Bytes::from("hello");
    /// assert_eq!(b.len(), 5);
    /// ```
    #[inline]
    pub const fn len(&self) -> usize {
        self.inner().len()
    }
150
    /// Returns true if the [`Utf8Bytes`] has a length of 0.
    ///
    /// This forwards to the inner buffer's own emptiness check.
    ///
    /// # Examples
    ///
    /// ```
    /// use utf8_bytes::Utf8Bytes;
    ///
    /// let b = Utf8Bytes::new();
    /// assert!(b.is_empty());
    /// ```
    #[inline]
    pub const fn is_empty(&self) -> bool {
        self.inner().is_empty()
    }
165
    /// Returns true if this is the only reference to the data and
    /// <code>[Into]<[Utf8BytesMut]></code> would avoid cloning the underlying
    /// buffer.
    ///
    /// Always returns false if the data is backed by a [static slice](Self::from_static),
    /// or an [owner](Self::from_owner).
    ///
    /// The result of this method may be invalidated immediately if another
    /// thread clones this value while this is being called. Ensure you have
    /// unique access to this value (`&mut Utf8Bytes`) first if you need to be
    /// certain the result is valid (i.e. for safety reasons).
    ///
    /// # Examples
    ///
    /// ```
    /// use utf8_bytes::Utf8Bytes;
    ///
    /// let a = Utf8Bytes::copy_from_str("123");
    /// assert!(a.is_unique());
    /// let b = a.clone();
    /// assert!(!a.is_unique());
    /// ```
    pub fn is_unique(&self) -> bool {
        // Forwarded unchanged to the inner `bytes::Bytes`.
        self.inner().is_unique()
    }
191
    /// Creates a [`Utf8Bytes`] instance from a string slice, by copying it.
    pub fn copy_from_str(data: &str) -> Self {
        // SAFETY:
        // - data: &str, so the copied bytes are UTF-8
        unsafe { Self::from_bytes_unchecked(bytes::Bytes::copy_from_slice(data.as_bytes())) }
    }
198
199    /// Returns a slice of self for the provided range.
200    ///
201    /// This will increment the reference count for the underlying memory and
202    /// return a new [`Utf8Bytes`] handle set to the slice.
203    ///
204    /// This operation is `O(1)`.
205    ///
206    /// # Examples
207    ///
208    /// ```
209    /// use utf8_bytes::Utf8Bytes;
210    ///
211    /// let a = Utf8Bytes::from("hello world");
212    /// let b = a.slice(2..5);
213    ///
214    /// assert_eq!(b, "llo");
215    /// ```
216    ///
217    /// # Panics
218    ///
219    /// - If `range` is out of bounds.
220    /// - `range` breaks a char boundary.
221    ///
222    #[track_caller]
223    pub fn slice(&self, range: impl RangeBounds<usize>) -> Self {
224        let lo = range.start_bound().cloned();
225        let hi = range.end_bound().cloned();
226        self.as_str()
227            .get((lo, hi))
228            .expect("range out of bounds or not on a char boundary");
229        // Safety:
230        // - checked the equivalent operation on &str
231        unsafe { Self::from_bytes_unchecked(self.inner().slice((lo, hi))) }
232    }
233
    /// Returns a slice of self that is equivalent to the given `subset`.
    ///
    /// When processing a [`Utf8Bytes`] buffer with other tools, one often gets
    /// a `&str` which is in fact a slice of the [`Utf8Bytes`],
    /// i.e. a subset of it.
    ///
    /// This function turns that `&str` into another [`Utf8Bytes`],
    /// as if one had called `self.slice()` with the offsets that correspond to
    /// `subset`.
    ///
    /// This operation is `O(1)`.
    ///
    /// # Examples
    ///
    /// ```
    /// use utf8_bytes::Utf8Bytes;
    ///
    /// let bytes = Utf8Bytes::from("012345678");
    /// let subset = &bytes[2..6];
    /// let subslice = bytes.slice_ref(&subset);
    /// assert_eq!(subslice, "2345");
    /// ```
    ///
    /// # Panics
    ///
    /// Requires that the given `subset` slice is in fact contained within the
    /// [`Utf8Bytes`] buffer; otherwise this function will panic.
    pub fn slice_ref(&self, subset: &str) -> Self {
        // SAFETY:
        // - `subset` is a `&str`, so the bytes it covers are UTF-8, _and_ the
        //   forwarded call does the bounds checks
        unsafe { Self::from_bytes_unchecked(self.inner().slice_ref(subset.as_bytes())) }
    }
266
267    /// Splits the bytes into two at the given index.
268    ///
269    /// Afterwards `self` contains elements `[0, at)`, and the returned `Bytes`
270    /// contains elements `[at, len)`. It's guaranteed that the memory does not
271    /// move, that is, the address of `self` does not change, and the address of
272    /// the returned slice is `at` bytes after that.
273    ///
274    /// This is an `O(1)` operation that just increases the reference count and
275    /// sets a few indices.
276    ///
277    /// # Examples
278    ///
279    /// ```
280    /// use utf8_bytes::Utf8Bytes;
281    ///
282    /// let mut a = Utf8Bytes::from("hello world");
283    /// let b = a.split_off(5);
284    ///
285    /// assert_eq!(a, "hello");
286    /// assert_eq!(b, " world");
287    /// ```
288    ///
289    /// # Panics
290    ///
291    /// Panics if `at > len` or does not lie on a char boundary.
292    #[must_use = "consider Bytes::truncate if you don't need the other half"]
293    pub fn split_off(&mut self, at: usize) -> Self {
294        let _char_boundary = self.as_str().split_at(at);
295        // SAFETY:
296        // - checked boundary above
297        unsafe { Self::from_bytes_unchecked(self.inner_mut().split_off(at)) }
298    }
299
300    /// Splits the bytes into two at the given index.
301    ///
302    /// Afterwards `self` contains elements `[at, len)`, and the returned
303    /// `Bytes` contains elements `[0, at)`.
304    ///
305    /// This is an `O(1)` operation that just increases the reference count and
306    /// sets a few indices.
307    ///
308    /// # Examples
309    ///
310    /// ```
311    /// use utf8_bytes::Utf8Bytes;
312    ///
313    /// let mut a = Utf8Bytes::from("hello world");
314    /// let b = a.split_to(5);
315    ///
316    /// assert_eq!(a, " world");
317    /// assert_eq!(b, "hello");
318    /// ```
319    ///
320    /// # Panics
321    ///
322    /// Panics if `at > len` or does not lie on a char boundary.
323    #[must_use = "consider Bytes::advance if you don't need the other half"]
324    pub fn split_to(&mut self, at: usize) -> Self {
325        let _char_boundary = self.as_str().split_at(at);
326        // SAFETY:
327        // - checked boundary above
328        unsafe { Self::from_bytes_unchecked(self.inner_mut().split_to(at)) }
329    }
330
331    /// Shortens the buffer, keeping the first `len` bytes and dropping the
332    /// rest.
333    ///
334    /// If `len` is greater than the buffer's current length, this has no
335    /// effect.
336    ///
337    /// The [split_off](`Self::split_off()`) method can emulate `truncate`, but this causes the
338    /// excess bytes to be returned instead of dropped.
339    ///
340    /// # Examples
341    ///
342    /// ```
343    /// use utf8_bytes::Utf8Bytes;
344    ///
345    /// let mut buf = Utf8Bytes::from("hello world");
346    /// buf.truncate(5);
347    /// assert_eq!(buf, "hello");
348    /// ```
349    ///
350    /// # Panics
351    ///
352    /// If `len` does not lie on a char boundary.
353    #[inline]
354    pub fn truncate(&mut self, len: usize) {
355        if len < self.len() {
356            let _char_boundary = self.as_str().split_at(len);
357            // SAFETY:
358            // - checked char boundary above
359            unsafe { self.inner_mut().truncate(len) }
360        };
361    }
362
    /// Clears the buffer, removing all data.
    ///
    /// Implemented as a truncation to length 0.
    ///
    /// # Examples
    ///
    /// ```
    /// use utf8_bytes::Utf8Bytes;
    ///
    /// let mut buf = Utf8Bytes::from("hello world");
    /// buf.clear();
    /// assert!(buf.is_empty());
    /// ```
    #[inline]
    pub fn clear(&mut self) {
        self.truncate(0);
    }
378
379    /// Try to convert self into `BytesMut`.
380    ///
381    /// If `self` is unique for the entire original buffer, this will succeed
382    /// and return a `BytesMut` with the contents of `self` without copying.
383    /// If `self` is not unique for the entire original buffer, this will fail
384    /// and return self.
385    ///
386    /// This will also always fail if the buffer was constructed via either
387    /// [from_owner](Bytes::from_owner) or [from_static](Bytes::from_static).
388    ///
389    /// # Examples
390    ///
391    /// ```
392    /// use utf8_bytes::{Utf8Bytes, Utf8BytesMut};
393    ///
394    /// let bytes = Utf8Bytes::from("hello".to_string());
395    /// assert_eq!(bytes.try_into_mut(), Ok(Utf8BytesMut::from("hello")));
396    /// ```
397    pub fn try_into_mut(self) -> Result<Utf8BytesMut, Utf8Bytes> {
398        match self.into_inner().try_into_mut() {
399            // SAFETY:
400            // - the bytes came from `self`
401            Ok(it) => Ok(unsafe { Utf8BytesMut::from_bytes_mut_unchecked(it) }),
402            Err(it) => Err(unsafe { Self::from_bytes_unchecked(it) }),
403        }
404    }
405}
406
407impl fmt::Debug for Utf8Bytes {
408    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
409        self.as_str().fmt(f)
410    }
411}
412
413impl fmt::Display for Utf8Bytes {
414    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
415        self.as_str().fmt(f)
416    }
417}
418
419impl Clone for Utf8Bytes {
420    #[inline]
421    fn clone(&self) -> Utf8Bytes {
422        unsafe { Self::from_bytes_unchecked(self.inner().clone()) }
423    }
424    fn clone_from(&mut self, source: &Self) {
425        self.inner().clone_from(&source.inner());
426    }
427}
428
/// Dereferences to the `str` held by the buffer, so `str` methods can be
/// called directly on a [`Utf8Bytes`].
impl Deref for Utf8Bytes {
    type Target = str;

    #[inline]
    fn deref(&self) -> &str {
        self.as_str()
    }
}
437
/// Borrows the contents as a `str`.
impl AsRef<str> for Utf8Bytes {
    #[inline]
    fn as_ref(&self) -> &str {
        self.as_str()
    }
}
444
445impl AsRef<[u8]> for Utf8Bytes {
446    #[inline]
447    fn as_ref(&self) -> &[u8] {
448        self.as_str().as_bytes()
449    }
450}
451
452impl hash::Hash for Utf8Bytes {
453    fn hash<H>(&self, state: &mut H)
454    where
455        H: hash::Hasher,
456    {
457        self.as_str().hash(state);
458    }
459}
460
/// Borrows the contents as a `str`, the same view that the `Hash` and `Ord`
/// impls in this file delegate to.
impl Borrow<str> for Utf8Bytes {
    fn borrow(&self) -> &str {
        self.as_str()
    }
}
466
467impl FromIterator<char> for Utf8Bytes {
468    fn from_iter<T: IntoIterator<Item = char>>(into_iter: T) -> Self {
469        String::from_iter(into_iter).into()
470    }
471}
472
473// impl Eq
474
475impl<T: AsRef<str>> PartialEq<T> for Utf8Bytes {
476    fn eq(&self, other: &T) -> bool {
477        self.as_str() == other.as_ref()
478    }
479}
480
481impl<T: AsRef<str>> PartialOrd<T> for Utf8Bytes {
482    fn partial_cmp(&self, other: &T) -> Option<cmp::Ordering> {
483        self.as_str().partial_cmp(other.as_ref())
484    }
485}
486
487impl Ord for Utf8Bytes {
488    fn cmp(&self, other: &Utf8Bytes) -> cmp::Ordering {
489        self.as_str().cmp(other.as_str())
490    }
491}
492
// Marker impl: equality is `str` equality (see the `PartialEq` impl).
impl Eq for Utf8Bytes {}
494
495impl PartialEq<Utf8Bytes> for str {
496    fn eq(&self, other: &Utf8Bytes) -> bool {
497        self.eq(other.as_str())
498    }
499}
500impl PartialEq<Utf8Bytes> for String {
501    fn eq(&self, other: &Utf8Bytes) -> bool {
502        self.eq(other.as_str())
503    }
504}
505impl<'a> PartialEq<Utf8Bytes> for Cow<'a, str> {
506    fn eq(&self, other: &Utf8Bytes) -> bool {
507        self.eq(other.as_str())
508    }
509}
510
511impl PartialOrd<Utf8Bytes> for str {
512    fn partial_cmp(&self, other: &Utf8Bytes) -> Option<cmp::Ordering> {
513        self.partial_cmp(other.as_str())
514    }
515}
516impl PartialOrd<Utf8Bytes> for String {
517    fn partial_cmp(&self, other: &Utf8Bytes) -> Option<cmp::Ordering> {
518        self.as_str().partial_cmp(other.as_str())
519    }
520}
521impl PartialOrd<Utf8Bytes> for Cow<'_, str> {
522    fn partial_cmp(&self, other: &Utf8Bytes) -> Option<cmp::Ordering> {
523        (**self).partial_cmp(other.as_str())
524    }
525}
526
527// impl From
528
/// The default value is the empty buffer, as built by [`Utf8Bytes::new`].
impl Default for Utf8Bytes {
    #[inline]
    fn default() -> Utf8Bytes {
        Utf8Bytes::new()
    }
}
535
/// Wraps a static string slice via [`Utf8Bytes::from_static`].
impl From<&'static str> for Utf8Bytes {
    fn from(s: &'static str) -> Utf8Bytes {
        Utf8Bytes::from_static(s)
    }
}
541
/// Converts a boxed `str` by handing its boxed bytes to [`bytes::Bytes`].
impl From<Box<str>> for Utf8Bytes {
    fn from(slice: Box<str>) -> Utf8Bytes {
        // SAFETY:
        // - slice: Box<str>, so its bytes are UTF-8
        unsafe { Self::from_bytes_unchecked(bytes::Bytes::from(slice.into_boxed_bytes())) }
    }
}
547
/// Unwraps into the inner [`bytes::Bytes`], discarding the UTF-8 typing.
impl From<Utf8Bytes> for bytes::Bytes {
    fn from(utf8: Utf8Bytes) -> Self {
        utf8.into_inner()
    }
}
553
554impl From<Utf8Bytes> for Utf8BytesMut {
555    /// Convert self into [`Utf8BytesMut`].
556    ///
557    /// If `bytes` is unique for the entire original buffer, this will return a
558    /// `BytesMut` with the contents of `bytes` without copying.
559    /// If `bytes` is not unique for the entire original buffer, this will make
560    /// a copy of `bytes` subset of the original buffer in a new `BytesMut`.
561    ///
562    /// # Examples
563    ///
564    /// ```
565    /// use utf8_bytes::{Utf8Bytes, Utf8BytesMut};
566    ///
567    /// let bytes = Utf8Bytes::copy_from_str("hello");
568    /// assert_eq!(Utf8BytesMut::from(bytes), "hello");
569    /// ```
570    fn from(bytes: Utf8Bytes) -> Self {
571        // SAFETY:
572        // - `bytes` is preserved.
573        unsafe { Self::from_bytes_mut_unchecked(bytes.into_inner().into()) }
574    }
575}
576
577impl From<String> for Utf8Bytes {
578    fn from(s: String) -> Utf8Bytes {
579        // SAFETY:
580        // - s contains UTF-8.
581        unsafe { Utf8Bytes::from_bytes_unchecked(bytes::Bytes::from(s.into_bytes())) }
582    }
583}
584
585impl From<Utf8Bytes> for Vec<u8> {
586    fn from(utf8: Utf8Bytes) -> Vec<u8> {
587        utf8.into_inner().into()
588    }
589}
590
591impl From<Utf8Bytes> for String {
592    fn from(utf8: Utf8Bytes) -> Self {
593        // SAFETY:
594        // - only contains UTF-8
595        unsafe { String::from_utf8_unchecked(utf8.into()) }
596    }
597}