wtf8_rs/wtf8/
mod.rs

1//! A WTF-8 slice.
2
3#[cfg(test)]
4mod tests;
5
6use crate::wtf8buf::Wtf8Buf;
7use crate::{codepoint, decode_surrogate, CodePoint};
8use alloc::borrow::Cow;
9use alloc::boxed::Box;
10use alloc::rc::Rc;
11use alloc::sync::Arc;
12use alloc::vec::Vec;
13use core::iter::FusedIterator;
14use core::ops::Index;
15use core::{fmt, slice, str};
16
17mod index;
18
19pub use index::*;
20
21/// A WTF-8 slice.
22#[derive(PartialEq, Eq, PartialOrd, Ord)]
23#[repr(transparent)]
24pub struct Wtf8 {
25    bytes: [u8],
26}
27
28impl Wtf8 {
29    #[inline]
30    pub(crate) fn bytes(&self) -> &[u8] {
31        &self.bytes
32    }
33
34    /// Coerces into a `Wtf8`. This accepts an [`&str`](prim@str) argument.
35    #[inline]
36    pub fn new<T: ?Sized + AsRef<Wtf8>>(x: &T) -> &Self {
37        x.as_ref()
38    }
39
40    /// Returns the length, in WTF-8 bytes.
41    #[inline]
42    pub fn len(&self) -> usize {
43        self.bytes.len()
44    }
45
46    /// Returns whether this is empty.
47    #[inline]
48    pub fn is_empty(&self) -> bool {
49        self.bytes.is_empty()
50    }
51
52    /// Returns the code point at `position` if it is in the ASCII range,
53    /// or `b'\xFF'` otherwise.
54    ///
55    /// # Panics
56    ///
57    /// Panics if `position` is beyond the end of the string.
58    #[inline]
59    pub fn ascii_byte_at(&self, position: usize) -> u8 {
60        match self.bytes[position] {
61            ascii_byte @ 0x00..=0x7F => ascii_byte,
62            _ => 0xFF,
63        }
64    }
65
66    /// Returns an iterator for the string’s code points.
67    #[inline]
68    pub fn code_points(&self) -> CodePoints<'_> {
69        CodePoints {
70            bytes: self.bytes.iter(),
71        }
72    }
73
74    /// Tries to convert the string to UTF-8 and return a [`&str`](prim@str) slice.
75    ///
76    /// Returns `Err(_)` if the string contains surrogates.
77    ///
78    /// This does not copy the data.
79    pub fn to_str(&self) -> Result<&str, ToStrError> {
80        let mut chunks = self.chunks();
81
82        let x = match chunks.next() {
83            Some(Wtf8Chunk::Utf8(str)) => str,
84            Some(Wtf8Chunk::UnpairedSurrogate(_)) => return Err(ToStrError { valid_up_to: 0 }),
85            None => return Ok(""),
86        };
87
88        if chunks.next().is_some() {
89            return Err(ToStrError {
90                valid_up_to: x.len(),
91            });
92        }
93
94        Ok(x)
95    }
96
97    /// Converts this string into a iterator of [`Wtf8Chunk`].
98    ///
99    /// The resulting iterator will intercalate [`Utf8`](Wtf8Chunk::Utf8) chunks
100    /// with one or more [`UnpairedSurrogate`](Wtf8Chunk::UnpairedSurrogate) and
101    /// all contained codepoints can be recovered from it.
102    #[inline]
103    pub fn chunks(&self) -> Chunks {
104        Chunks(&self.bytes)
105    }
106
107    /// Lossily converts the string to UTF-8.
108    /// Returns a UTF-8 `&str` slice if the contents are well-formed in UTF-8.
109    ///
110    /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”).
111    ///
112    /// This only copies the data if necessary (if it contains any surrogate).
113    pub fn to_string_lossy(&self) -> Cow<str> {
114        let mut chunks = self.chunks();
115
116        if chunks.next_surrogate().is_none() {
117            return Cow::Borrowed(chunks.next().and_then(Wtf8Chunk::utf8).unwrap_or(""));
118        }
119
120        let chunks: Vec<_> = chunks.map(|a| a.utf8().unwrap_or("\u{FFFD}")).collect();
121        Cow::Owned(chunks.join(""))
122    }
123
124    /// Returns a slice of the given string for the byte range.
125    ///
126    /// Returns `None` whenever [`index`](#impl-Index<T>) would panic.
127    #[inline]
128    pub fn get<I: Wtf8Index>(&self, i: I) -> Option<&Self> {
129        i.get(self)
130    }
131
132    /// Converts the WTF-8 string to potentially ill-formed UTF-16
133    /// and return an iterator of 16-bit code units.
134    #[inline]
135    pub fn encode_utf16(&self) -> EncodeUtf16<'_> {
136        EncodeUtf16(CodePoint::encode_utf16(self.code_points()))
137    }
138
139    /// Returns a slice of the given string for the byte range.
140    ///
141    /// # Safety
142    ///
143    /// Produces undefined behaviour whenever [`index`](#impl-Index<T>) would panic.
144    #[inline]
145    pub unsafe fn get_unchecked<I: Wtf8Index>(&self, i: I) -> &Self {
146        &*i.get_unchecked(self)
147    }
148
149    /// Whether a given index is at a code point boundary.
150    #[inline]
151    pub fn is_code_point_boundary(&self, index: usize) -> bool {
152        if index == self.len() {
153            return true;
154        }
155        !matches!(self.bytes.get(index), None | Some(128..=191))
156    }
157
158    /// Boxes this `Wtf8`.
159    #[inline]
160    pub fn to_box(&self) -> Box<Wtf8> {
161        let boxed: Box<[u8]> = self.bytes.into();
162        // Safety: This is sound as type layouts match
163        unsafe { Box::from_raw(Box::into_raw(boxed) as *mut Wtf8) }
164    }
165
166    /// Creates a boxed, empty `Wtf8`.
167    pub fn empty_box() -> Box<Wtf8> {
168        let boxed: Box<[u8]> = Default::default();
169        // Safety: This is sound as type layouts match
170        unsafe { Box::from_raw(Box::into_raw(boxed) as *mut Wtf8) }
171    }
172
173    /// Boxes this `Wtf8` with [`Arc`](alloc::sync::Arc).
174    #[inline]
175    pub fn to_arc(&self) -> Arc<Wtf8> {
176        let arc: Arc<[u8]> = Arc::from(&self.bytes);
177        // Safety: This is sound as type layouts match
178        unsafe { Arc::from_raw(Arc::into_raw(arc) as *const Wtf8) }
179    }
180
181    /// Boxes this `Wtf8` with [`Rc`](alloc::rc::Rc).
182    #[inline]
183    pub fn to_rc(&self) -> Rc<Wtf8> {
184        let rc: Rc<[u8]> = Rc::from(&self.bytes);
185        // Safety: This is sound as type layouts match
186        unsafe { Rc::from_raw(Rc::into_raw(rc) as *const Wtf8) }
187    }
188
189    /// Converts this slice to its ASCII lower case equivalent in-place.
190    ///
191    /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',
192    /// but non-ASCII letters are unchanged.
193    ///
194    /// To return a new lowercased value without modifying the existing one, use
195    /// [`to_ascii_lowercase`].
196    ///
197    /// [`to_ascii_lowercase`]: #method.to_ascii_lowercase
198    #[inline]
199    pub fn make_ascii_lowercase(&mut self) {
200        self.bytes.make_ascii_lowercase()
201    }
202
203    /// Converts this slice to its ASCII upper case equivalent in-place.
204    ///
205    /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z',
206    /// but non-ASCII letters are unchanged.
207    ///
208    /// To return a new uppercased value without modifying the existing one, use
209    /// [`to_ascii_uppercase`].
210    ///
211    /// [`to_ascii_uppercase`]: #method.to_ascii_uppercase
212    #[inline]
213    pub fn make_ascii_uppercase(&mut self) {
214        self.bytes.make_ascii_uppercase()
215    }
216
217    /// Returns a [`Wtf8Buf`] containing a copy of this slice where each byte
218    /// is mapped to its ASCII lower case equivalent.
219    ///
220    /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',
221    /// but non-ASCII letters are unchanged.
222    #[inline]
223    pub fn to_ascii_lowercase(&self) -> Wtf8Buf {
224        Wtf8Buf::from_bytes(self.bytes.to_ascii_lowercase())
225    }
226
227    /// Returns a [`Wtf8Buf`] containing a copy of this slice where each byte
228    /// is mapped to its ASCII upper case equivalent.
229    ///
230    /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z',
231    /// but non-ASCII letters are unchanged.
232    ///
233    /// To uppercase the value in-place, use [`make_ascii_uppercase`].
234    ///
235    /// [`make_ascii_uppercase`]: #method.make_ascii_uppercase
236    #[inline]
237    pub fn to_ascii_uppercase(&self) -> Wtf8Buf {
238        Wtf8Buf::from_bytes(self.bytes.to_ascii_uppercase())
239    }
240
241    /// Checks if all bytes in this slice are within the ASCII range.
242    #[inline]
243    pub fn is_ascii(&self) -> bool {
244        self.bytes.is_ascii()
245    }
246
247    /// Checks that two slices are an ASCII case-insensitive match.
248    ///
249    /// Same as `to_ascii_lowercase(a) == to_ascii_lowercase(b)`,
250    /// but without allocating and copying temporaries.
251    #[inline]
252    pub fn eq_ignore_ascii_case(&self, other: &Self) -> bool {
253        self.bytes.eq_ignore_ascii_case(&other.bytes)
254    }
255
256    #[inline]
257    pub(crate) fn initial_trail_surrogate(&self) -> Option<u16> {
258        match self.bytes {
259            [0xED, b2 @ 0xB0..=0xBF, b3, ..] => Some(decode_surrogate(b2, b3)),
260            _ => None,
261        }
262    }
263}
264
265impl From<&Wtf8> for Box<Wtf8> {
266    #[inline]
267    fn from(x: &Wtf8) -> Self {
268        x.to_box()
269    }
270}
271
272impl From<&Wtf8> for Rc<Wtf8> {
273    #[inline]
274    fn from(x: &Wtf8) -> Self {
275        x.to_rc()
276    }
277}
278
279impl From<&Wtf8> for Arc<Wtf8> {
280    #[inline]
281    fn from(x: &Wtf8) -> Self {
282        x.to_arc()
283    }
284}
285
286impl fmt::Debug for Wtf8 {
287    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
288        use core::fmt::Write;
289
290        formatter.write_str("\"")?;
291
292        for c in self.chunks() {
293            match c {
294                Wtf8Chunk::Utf8(c) => {
295                    for ch in c.chars().flat_map(|x| x.escape_debug()) {
296                        formatter.write_char(ch)?;
297                    }
298                }
299                Wtf8Chunk::UnpairedSurrogate(e) => write!(formatter, "\\u{{{:x}}}", e)?,
300            }
301        }
302
303        formatter.write_str("\"")
304    }
305}
306
307impl fmt::Display for Wtf8 {
308    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
309        for chunk in self.chunks() {
310            formatter.write_str(chunk.utf8().unwrap_or("\u{FFFD}"))?;
311        }
312
313        Ok(())
314    }
315}
316
317/// Returns a slice of the given string for the byte range.
318///
319/// # Panics
320///
321/// Panics when the boundaries of the range do not point to code point
322/// boundaries, or point beyond the end of the string.
323impl<T: Wtf8Index> Index<T> for Wtf8 {
324    type Output = Wtf8;
325
326    #[inline]
327    fn index(&self, index: T) -> &Wtf8 {
328        match self.get(index.clone()) {
329            Some(x) => x,
330            None => panic!(
331                "index {:?} in `{:?}` do not lie on character boundary",
332                index, self
333            ),
334        }
335    }
336}
337
338impl AsRef<Wtf8> for str {
339    #[inline]
340    fn as_ref(&self) -> &Wtf8 {
341        // Safety: the cast is sound because repr(transparent), matching the layout of str.
342        // UTF-8 is a subset of WTF-8, so type invariants are never violated.
343        unsafe { &*(self as *const str as *const Wtf8) }
344    }
345}
346
/// Errors which can occur when converting `Wtf8` to `str`.
///
/// The error records the byte offset of the first surrogate found.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub struct ToStrError {
    valid_up_to: usize,
}
impl ToStrError {
    /// Returns the index in the given string up to which valid UTF-8 was
    /// verified.
    ///
    /// It is the maximum index such that `wstr[..index].to_str()` would
    /// return `Ok(_)`.
    #[inline]
    pub fn valid_up_to(&self) -> usize {
        self.valid_up_to
    }

    /// The length provided is that of the invalid byte sequence
    /// that starts at the index given by `valid_up_to()`.
    /// Decoding should resume after that sequence
    /// (after inserting a [`U+FFFD REPLACEMENT CHARACTER`][U+FFFD]) in case
    /// of lossy decoding.
    ///
    /// This is always `3`.
    ///
    /// [U+FFFD]: char::REPLACEMENT_CHARACTER
    #[inline]
    pub fn error_len(&self) -> usize {
        3
    }
}
impl fmt::Display for ToStrError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // The reported length comes from `error_len` so the message and the
        // accessor cannot drift apart.
        write!(
            f,
            "invalid utf-8 sequence of {} bytes from index {}",
            self.error_len(),
            self.valid_up_to
        )
    }
}
384
385/// Iterator of points over a string.
386pub struct CodePoints<'a> {
387    bytes: slice::Iter<'a, u8>,
388}
389impl Iterator for CodePoints<'_> {
390    type Item = CodePoint;
391
392    #[inline]
393    fn next(&mut self) -> Option<CodePoint> {
394        // Copied from core::str::next_code_point
395
396        /// Mask of the value bits of a continuation byte.
397        const CONT_MASK: u8 = 0b0011_1111;
398
399        /// Returns the initial codepoint accumulator for the first byte.
400        /// The first byte is special, only want bottom 5 bits for width 2, 4 bits
401        /// for width 3, and 3 bits for width 4.
402        #[inline]
403        fn utf8_first_byte(byte: u8, width: u32) -> u32 {
404            (byte & (0x7F >> width)) as u32
405        }
406
407        /// Returns the value of `ch` updated with continuation byte `byte`.
408        #[inline]
409        fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
410            (ch << 6) | (byte & CONT_MASK) as u32
411        }
412
413        #[inline]
414        fn unwrap_or_0(opt: Option<&u8>) -> u8 {
415            match opt {
416                Some(&byte) => byte,
417                None => 0,
418            }
419        }
420
421        let x = *self.bytes.next()?;
422        if x < 128 {
423            // Safety: the char is ascii.
424            return Some(unsafe { CodePoint::from_u32_unchecked(x as u32) });
425        }
426
427        // Multibyte case follows
428        // Decode from a byte combination out of: [[[x y] z] w]
429        // NOTE: Performance is sensitive to the exact formulation here
430        let init = utf8_first_byte(x, 2);
431        let y = unwrap_or_0(self.bytes.next());
432        let mut ch = utf8_acc_cont_byte(init, y);
433        if x >= 0xE0 {
434            // [[x y z] w] case
435            // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
436            let z = unwrap_or_0(self.bytes.next());
437            let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
438            ch = init << 12 | y_z;
439            if x >= 0xF0 {
440                // [x y z w] case
441                // use only the lower 3 bits of `init`
442                let w = unwrap_or_0(self.bytes.next());
443                ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w);
444            }
445        }
446
447        // Safety: the code point can not be greater than 0x10_FFFF.
448        Some(unsafe { CodePoint::from_u32_unchecked(ch) })
449    }
450
451    #[inline]
452    fn size_hint(&self) -> (usize, Option<usize>) {
453        let v = self.bytes.len();
454        (v.saturating_add(3) / 4, Some(v))
455    }
456}
457impl FusedIterator for CodePoints<'_> {}
458
459/// An iterator for encoding potentially ill-formed UTF-16 from a WTF-8 input.
460pub struct EncodeUtf16<'a>(codepoint::EncodeUtf16<CodePoints<'a>>);
461impl Iterator for EncodeUtf16<'_> {
462    type Item = u16;
463
464    #[inline]
465    fn next(&mut self) -> Option<u16> {
466        self.0.next()
467    }
468
469    #[inline]
470    fn size_hint(&self) -> (usize, Option<usize>) {
471        self.0.size_hint()
472    }
473}
474impl FusedIterator for EncodeUtf16<'_> {}
475
/// Part of a WTF-8 slice.
///
/// Either a [`Utf8`](Self::Utf8) string, or an [`UnpairedSurrogate`](Self::UnpairedSurrogate).
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
pub enum Wtf8Chunk<'a> {
    /// The chunk as a UTF-8 string.
    Utf8(&'a str),

    /// The chunk as an unpaired surrogate.
    UnpairedSurrogate(u16),
}

impl<'a> Wtf8Chunk<'a> {
    /// Returns the contained string for a [`Utf8`](Self::Utf8) chunk, and
    /// `None` for an [`UnpairedSurrogate`](Self::UnpairedSurrogate).
    #[inline]
    pub fn utf8(self) -> Option<&'a str> {
        if let Wtf8Chunk::Utf8(text) = self {
            Some(text)
        } else {
            None
        }
    }
}
498
499/// An iterator created by [`chunks`](Wtf8::chunks).
500pub struct Chunks<'a>(&'a [u8]);
501impl Chunks<'_> {
502    #[inline]
503    pub(crate) fn next_surrogate(&self) -> Option<usize> {
504        let mut pos = 0;
505        let mut iter = self.0.iter();
506
507        loop {
508            let b = *iter.next()?;
509            if b < 0x80 {
510                pos += 1;
511            } else if b < 0xE0 {
512                iter.next();
513                pos += 2;
514            } else if b == 0xED {
515                match (iter.next(), iter.next()) {
516                    (Some(&b2), Some(_)) if b2 >= 0xA0 => {
517                        return Some(pos);
518                    }
519                    _ => pos += 3,
520                }
521            } else if b < 0xF0 {
522                iter.next();
523                iter.next();
524                pos += 3;
525            } else {
526                iter.next();
527                iter.next();
528                iter.next();
529                pos += 4;
530            }
531        }
532    }
533}
534impl<'a> Iterator for Chunks<'a> {
535    type Item = Wtf8Chunk<'a>;
536
537    #[inline]
538    fn next(&mut self) -> Option<Wtf8Chunk<'a>> {
539        match self.next_surrogate() {
540            Some(0) => {
541                let s = decode_surrogate(self.0[1], self.0[2]);
542                self.0 = &self.0[3..];
543                Some(Wtf8Chunk::UnpairedSurrogate(s))
544            }
545
546            Some(x) => {
547                let r = &self.0[..x];
548                self.0 = &self.0[x..];
549                // Safety: there are no surrogates, therefore the string is UTF-8.
550                Some(Wtf8Chunk::Utf8(unsafe { str::from_utf8_unchecked(r) }))
551            }
552
553            None if self.0.is_empty() => None,
554
555            None => {
556                let r = self.0;
557                self.0 = &[];
558                // Safety: there are no surrogates, therefore the string is UTF-8.
559                Some(Wtf8Chunk::Utf8(unsafe { str::from_utf8_unchecked(r) }))
560            }
561        }
562    }
563}