rustpython_wtf8/
lib.rs

1// spell-checker:disable
2
3//! An implementation of [WTF-8], a utf8-compatible encoding that allows for
4//! unpaired surrogate codepoints. This implementation additionally allows for
5//! paired surrogates that are nonetheless treated as two separate codepoints.
6//!
7//!
8//! RustPython uses this because CPython internally uses a variant of UCS-1/2/4
9//! as its string storage, which treats each `u8`/`u16`/`u32` value (depending
10//! on the highest codepoint value in the string) as simply integers, unlike
11//! UTF-8 or UTF-16 where some characters are encoded using multi-byte
12//! sequences. CPython additionally doesn't disallow the use of surrogates in
13//! `str`s (which in UTF-16 pair together to represent codepoints with a value
14//! higher than `u16::MAX`) and in fact takes quite extensive advantage of the
15//! fact that they're allowed. The `surrogateescape` codec-error handler uses
16//! them to represent byte sequences which are invalid in the given codec (e.g.
17//! bytes with their high bit set in ASCII or UTF-8) by mapping them into the
18//! surrogate range. `surrogateescape` is the default error handler in Python
19//! for interacting with the filesystem, and thus if RustPython is to properly
20//! support `surrogateescape`, its `str`s must be able to represent surrogates.
21//!
22//! We use WTF-8 over something more similar to CPython's string implementation
23//! because of its compatibility with UTF-8, meaning that in the case where a
24//! string has no surrogates, it can be viewed as a UTF-8 Rust [`prim@str`] without
25//! needing any copies or re-encoding.
26//!
27//! This implementation is mostly copied from the WTF-8 implementation in the
28//! Rust 1.85 standard library, which is used as the backing for [`OsStr`] on
29//! Windows targets. As previously mentioned, however, it is modified to not
30//! join two surrogates into one codepoint when concatenating strings, in order
31//! to match CPython's behavior.
32//!
33//! [WTF-8]: https://simonsapin.github.io/wtf-8
34//! [`OsStr`]: std::ffi::OsStr
35
36#![no_std]
37#![allow(clippy::precedence, clippy::match_overlapping_arm)]
38
39extern crate alloc;
40
41use alloc::borrow::{Cow, ToOwned};
42use alloc::boxed::Box;
43use alloc::collections::TryReserveError;
44use alloc::string::String;
45use alloc::vec::Vec;
46use core::borrow::Borrow;
47use core::fmt;
48use core::hash::{Hash, Hasher};
49use core::iter::FusedIterator;
50use core::mem;
51use core::ops;
52use core::slice;
53use core::str;
54use core_char::MAX_LEN_UTF8;
55use core_char::{MAX_LEN_UTF16, encode_utf8_raw, encode_utf16_raw, len_utf8};
56use core_str::{next_code_point, next_code_point_reverse};
57use itertools::{Either, Itertools};
58
59use bstr::{ByteSlice, ByteVec};
60
61mod core_char;
62mod core_str;
63mod core_str_count;
64
/// U+FFFD REPLACEMENT CHARACTER as a UTF-8 string; the lossy conversions in
/// this file write it in place of each surrogate.
const UTF8_REPLACEMENT_CHARACTER: &str = "\u{FFFD}";
66
/// A Unicode code point: any value from U+0000 to U+10FFFF, surrogates
/// included.
///
/// Compare with the `char` type, which represents a Unicode scalar value:
/// a code point that is not a surrogate (U+D800 to U+DFFF).
#[derive(Eq, PartialEq, Ord, PartialOrd, Clone, Copy)]
pub struct CodePoint {
    // Numeric value of the code point; the constructors keep it <= 0x10FFFF.
    value: u32,
}
76
77/// Format the code point as `U+` followed by four to six hexadecimal digits.
78/// Example: `U+1F4A9`
79impl fmt::Debug for CodePoint {
80    #[inline]
81    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
82        write!(formatter, "U+{:04X}", self.value)
83    }
84}
85
86impl fmt::Display for CodePoint {
87    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
88        self.to_char_lossy().fmt(f)
89    }
90}
91
92impl CodePoint {
93    /// Unsafely creates a new `CodePoint` without checking the value.
94    ///
95    /// # Safety
96    ///
97    /// `value` must be less than or equal to 0x10FFFF.
98    #[inline]
99    pub const unsafe fn from_u32_unchecked(value: u32) -> CodePoint {
100        CodePoint { value }
101    }
102
103    /// Creates a new `CodePoint` if the value is a valid code point.
104    ///
105    /// Returns `None` if `value` is above 0x10FFFF.
106    #[inline]
107    pub const fn from_u32(value: u32) -> Option<CodePoint> {
108        match value {
109            0..=0x10FFFF => Some(CodePoint { value }),
110            _ => None,
111        }
112    }
113
114    /// Creates a new `CodePoint` from a `char`.
115    ///
116    /// Since all Unicode scalar values are code points, this always succeeds.
117    #[inline]
118    pub const fn from_char(value: char) -> CodePoint {
119        CodePoint {
120            value: value as u32,
121        }
122    }
123
124    /// Returns the numeric value of the code point.
125    #[inline]
126    pub const fn to_u32(self) -> u32 {
127        self.value
128    }
129
130    /// Returns the numeric value of the code point if it is a leading surrogate.
131    #[inline]
132    pub const fn to_lead_surrogate(self) -> Option<LeadSurrogate> {
133        match self.value {
134            lead @ 0xD800..=0xDBFF => Some(LeadSurrogate(lead as u16)),
135            _ => None,
136        }
137    }
138
139    /// Returns the numeric value of the code point if it is a trailing surrogate.
140    #[inline]
141    pub const fn to_trail_surrogate(self) -> Option<TrailSurrogate> {
142        match self.value {
143            trail @ 0xDC00..=0xDFFF => Some(TrailSurrogate(trail as u16)),
144            _ => None,
145        }
146    }
147
148    /// Optionally returns a Unicode scalar value for the code point.
149    ///
150    /// Returns `None` if the code point is a surrogate (from U+D800 to U+DFFF).
151    #[inline]
152    pub const fn to_char(self) -> Option<char> {
153        match self.value {
154            0xD800..=0xDFFF => None,
155            _ => Some(unsafe { char::from_u32_unchecked(self.value) }),
156        }
157    }
158
159    /// Returns a Unicode scalar value for the code point.
160    ///
161    /// Returns `'\u{FFFD}'` (the replacement character “�”)
162    /// if the code point is a surrogate (from U+D800 to U+DFFF).
163    #[inline]
164    pub fn to_char_lossy(self) -> char {
165        self.to_char().unwrap_or('\u{FFFD}')
166    }
167
168    pub fn is_char_and(self, f: impl FnOnce(char) -> bool) -> bool {
169        self.to_char().is_some_and(f)
170    }
171
172    pub fn encode_wtf8(self, dst: &mut [u8]) -> &mut Wtf8 {
173        unsafe { Wtf8::from_mut_bytes_unchecked(encode_utf8_raw(self.value, dst)) }
174    }
175
176    pub const fn len_wtf8(&self) -> usize {
177        len_utf8(self.value)
178    }
179
180    pub fn is_ascii(&self) -> bool {
181        self.is_char_and(|c| c.is_ascii())
182    }
183}
184
185impl From<u16> for CodePoint {
186    fn from(value: u16) -> Self {
187        unsafe { Self::from_u32_unchecked(value.into()) }
188    }
189}
190
191impl From<u8> for CodePoint {
192    fn from(value: u8) -> Self {
193        char::from(value).into()
194    }
195}
196
197impl From<char> for CodePoint {
198    fn from(value: char) -> Self {
199        Self::from_char(value)
200    }
201}
202
203impl From<ascii::AsciiChar> for CodePoint {
204    fn from(value: ascii::AsciiChar) -> Self {
205        Self::from_char(value.into())
206    }
207}
208
209impl From<CodePoint> for Wtf8Buf {
210    fn from(ch: CodePoint) -> Self {
211        ch.encode_wtf8(&mut [0; MAX_LEN_UTF8]).to_owned()
212    }
213}
214
215impl PartialEq<char> for CodePoint {
216    fn eq(&self, other: &char) -> bool {
217        self.to_u32() == *other as u32
218    }
219}
220impl PartialEq<CodePoint> for char {
221    fn eq(&self, other: &CodePoint) -> bool {
222        *self as u32 == other.to_u32()
223    }
224}
225
/// A leading surrogate code unit, as produced by
/// [`CodePoint::to_lead_surrogate`] for values in U+D800..=U+DBFF.
#[derive(Clone, Copy)]
pub struct LeadSurrogate(u16);
228
/// A trailing surrogate code unit, as produced by
/// [`CodePoint::to_trail_surrogate`] for values in U+DC00..=U+DFFF.
#[derive(Clone, Copy)]
pub struct TrailSurrogate(u16);
231
232impl LeadSurrogate {
233    pub const fn merge(self, trail: TrailSurrogate) -> char {
234        decode_surrogate_pair(self.0, trail.0)
235    }
236}
237
/// An owned, growable string of well-formed WTF-8 data.
///
/// Similar to `String`, but can additionally contain surrogate code points.
/// As the crate-level documentation explains, this implementation also
/// permits a leading surrogate directly followed by a trailing one; the two
/// are kept as separate code points rather than being joined.
#[derive(Eq, PartialEq, Ord, PartialOrd, Clone, Default)]
pub struct Wtf8Buf {
    // The encoded contents; must always hold valid WTF-8.
    bytes: Vec<u8>,
}
246
247impl ops::Deref for Wtf8Buf {
248    type Target = Wtf8;
249
250    fn deref(&self) -> &Wtf8 {
251        self.as_slice()
252    }
253}
254
255impl ops::DerefMut for Wtf8Buf {
256    fn deref_mut(&mut self) -> &mut Wtf8 {
257        self.as_mut_slice()
258    }
259}
260
261impl Borrow<Wtf8> for Wtf8Buf {
262    fn borrow(&self) -> &Wtf8 {
263        self
264    }
265}
266
267/// Formats the string in double quotes, with characters escaped according to
268/// [`char::escape_debug`] and unpaired surrogates represented as `\u{xxxx}`,
269/// where each `x` is a hexadecimal digit.
270///
271/// For example, the code units [U+0061, U+D800, U+000A] are formatted as
272/// `"a\u{D800}\n"`.
273impl fmt::Debug for Wtf8Buf {
274    #[inline]
275    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
276        fmt::Debug::fmt(&**self, formatter)
277    }
278}
279
280/// Formats the string with unpaired surrogates substituted with the replacement
281/// character, U+FFFD.
282impl fmt::Display for Wtf8Buf {
283    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
284        fmt::Display::fmt(&**self, formatter)
285    }
286}
287
288impl Wtf8Buf {
289    /// Creates a new, empty WTF-8 string.
290    #[inline]
291    pub fn new() -> Wtf8Buf {
292        Wtf8Buf::default()
293    }
294
295    /// Creates a new, empty WTF-8 string with pre-allocated capacity for `capacity` bytes.
296    #[inline]
297    pub fn with_capacity(capacity: usize) -> Wtf8Buf {
298        Wtf8Buf {
299            bytes: Vec::with_capacity(capacity),
300        }
301    }
302
303    /// Creates a WTF-8 string from a WTF-8 byte vec.
304    ///
305    /// # Safety
306    ///
307    /// `value` must contain valid WTF-8.
308    #[inline]
309    pub const unsafe fn from_bytes_unchecked(value: Vec<u8>) -> Wtf8Buf {
310        Wtf8Buf { bytes: value }
311    }
312
313    /// Create a WTF-8 string from a WTF-8 byte vec.
314    pub fn from_bytes(value: Vec<u8>) -> Result<Self, Vec<u8>> {
315        match Wtf8::from_bytes(&value) {
316            Some(_) => Ok(unsafe { Self::from_bytes_unchecked(value) }),
317            None => Err(value),
318        }
319    }
320
321    /// Creates a WTF-8 string from a UTF-8 `String`.
322    ///
323    /// This takes ownership of the `String` and does not copy.
324    ///
325    /// Since WTF-8 is a superset of UTF-8, this always succeeds.
326    #[inline]
327    pub fn from_string(string: String) -> Wtf8Buf {
328        Wtf8Buf {
329            bytes: string.into_bytes(),
330        }
331    }
332
333    pub fn join<I, S>(sep: impl AsRef<Wtf8>, iter: I) -> Wtf8Buf
334    where
335        I: IntoIterator<Item = S>,
336        S: AsRef<Wtf8>,
337    {
338        let sep = sep.as_ref();
339        let mut iter = iter.into_iter();
340        let mut buf = match iter.next() {
341            Some(first) => first.as_ref().to_owned(),
342            None => return Wtf8Buf::new(),
343        };
344        for part in iter {
345            buf.push_wtf8(sep);
346            buf.push_wtf8(part.as_ref());
347        }
348        buf
349    }
350
351    pub fn clear(&mut self) {
352        self.bytes.clear();
353    }
354
355    /// Creates a WTF-8 string from a potentially ill-formed UTF-16 slice of 16-bit code units.
356    ///
357    /// This is lossless: calling `.encode_wide()` on the resulting string
358    /// will always return the original code units.
359    pub fn from_wide(v: &[u16]) -> Wtf8Buf {
360        let mut string = Wtf8Buf::with_capacity(v.len());
361        for item in char::decode_utf16(v.iter().cloned()) {
362            match item {
363                Ok(ch) => string.push_char(ch),
364                Err(surrogate) => {
365                    let surrogate = surrogate.unpaired_surrogate();
366                    // Surrogates are known to be in the code point range.
367                    let code_point = CodePoint::from(surrogate);
368                    // Skip the WTF-8 concatenation check,
369                    // surrogate pairs are already decoded by decode_utf16
370                    string.push(code_point);
371                }
372            }
373        }
374        string
375    }
376
377    #[inline]
378    pub fn as_slice(&self) -> &Wtf8 {
379        unsafe { Wtf8::from_bytes_unchecked(&self.bytes) }
380    }
381
382    #[inline]
383    pub fn as_mut_slice(&mut self) -> &mut Wtf8 {
384        // Safety: `Wtf8` doesn't expose any way to mutate the bytes that would
385        // cause them to change from well-formed UTF-8 to ill-formed UTF-8,
386        // which would break the assumptions of the `is_known_utf8` field.
387        unsafe { Wtf8::from_mut_bytes_unchecked(&mut self.bytes) }
388    }
389
390    /// Reserves capacity for at least `additional` more bytes to be inserted
391    /// in the given `Wtf8Buf`.
392    /// The collection may reserve more space to avoid frequent reallocations.
393    ///
394    /// # Panics
395    ///
396    /// Panics if the new capacity exceeds `isize::MAX` bytes.
397    #[inline]
398    pub fn reserve(&mut self, additional: usize) {
399        self.bytes.reserve(additional)
400    }
401
402    /// Tries to reserve capacity for at least `additional` more bytes to be
403    /// inserted in the given `Wtf8Buf`. The `Wtf8Buf` may reserve more space to
404    /// avoid frequent reallocations. After calling `try_reserve`, capacity will
405    /// be greater than or equal to `self.len() + additional`. Does nothing if
406    /// capacity is already sufficient. This method preserves the contents even
407    /// if an error occurs.
408    ///
409    /// # Errors
410    ///
411    /// If the capacity overflows, or the allocator reports a failure, then an error
412    /// is returned.
413    #[inline]
414    pub fn try_reserve(&mut self, additional: usize) -> Result<(), TryReserveError> {
415        self.bytes.try_reserve(additional)
416    }
417
418    #[inline]
419    pub fn reserve_exact(&mut self, additional: usize) {
420        self.bytes.reserve_exact(additional)
421    }
422
423    /// Tries to reserve the minimum capacity for exactly `additional` more
424    /// bytes to be inserted in the given `Wtf8Buf`. After calling
425    /// `try_reserve_exact`, capacity will be greater than or equal to
426    /// `self.len() + additional` if it returns `Ok(())`.
427    /// Does nothing if the capacity is already sufficient.
428    ///
429    /// Note that the allocator may give the `Wtf8Buf` more space than it
430    /// requests. Therefore, capacity can not be relied upon to be precisely
431    /// minimal. Prefer [`try_reserve`] if future insertions are expected.
432    ///
433    /// [`try_reserve`]: Wtf8Buf::try_reserve
434    ///
435    /// # Errors
436    ///
437    /// If the capacity overflows, or the allocator reports a failure, then an error
438    /// is returned.
439    #[inline]
440    pub fn try_reserve_exact(&mut self, additional: usize) -> Result<(), TryReserveError> {
441        self.bytes.try_reserve_exact(additional)
442    }
443
444    #[inline]
445    pub fn shrink_to_fit(&mut self) {
446        self.bytes.shrink_to_fit()
447    }
448
449    #[inline]
450    pub fn shrink_to(&mut self, min_capacity: usize) {
451        self.bytes.shrink_to(min_capacity)
452    }
453
454    #[inline]
455    pub fn leak<'a>(self) -> &'a mut Wtf8 {
456        unsafe { Wtf8::from_mut_bytes_unchecked(self.bytes.leak()) }
457    }
458
459    /// Returns the number of bytes that this string buffer can hold without reallocating.
460    #[inline]
461    pub const fn capacity(&self) -> usize {
462        self.bytes.capacity()
463    }
464
465    /// Append a UTF-8 slice at the end of the string.
466    #[inline]
467    pub fn push_str(&mut self, other: &str) {
468        self.bytes.extend_from_slice(other.as_bytes())
469    }
470
471    /// Append a WTF-8 slice at the end of the string.
472    #[inline]
473    pub fn push_wtf8(&mut self, other: &Wtf8) {
474        self.bytes.extend_from_slice(&other.bytes);
475    }
476
477    /// Append a Unicode scalar value at the end of the string.
478    #[inline]
479    pub fn push_char(&mut self, c: char) {
480        self.push(CodePoint::from_char(c))
481    }
482
483    /// Append a code point at the end of the string.
484    #[inline]
485    pub fn push(&mut self, code_point: CodePoint) {
486        self.push_wtf8(code_point.encode_wtf8(&mut [0; MAX_LEN_UTF8]))
487    }
488
489    pub fn pop(&mut self) -> Option<CodePoint> {
490        let ch = self.code_points().next_back()?;
491        let new_len = self.len() - ch.len_wtf8();
492        self.bytes.truncate(new_len);
493        Some(ch)
494    }
495
496    /// Shortens a string to the specified length.
497    ///
498    /// # Panics
499    ///
500    /// Panics if `new_len` > current length,
501    /// or if `new_len` is not a code point boundary.
502    #[inline]
503    pub fn truncate(&mut self, new_len: usize) {
504        assert!(is_code_point_boundary(self, new_len));
505        self.bytes.truncate(new_len)
506    }
507
508    /// Inserts a codepoint into this `Wtf8Buf` at a byte position.
509    #[inline]
510    pub fn insert(&mut self, idx: usize, c: CodePoint) {
511        self.insert_wtf8(idx, c.encode_wtf8(&mut [0; MAX_LEN_UTF8]))
512    }
513
514    /// Inserts a WTF-8 slice into this `Wtf8Buf` at a byte position.
515    #[inline]
516    pub fn insert_wtf8(&mut self, idx: usize, w: &Wtf8) {
517        assert!(is_code_point_boundary(self, idx));
518
519        self.bytes.insert_str(idx, w)
520    }
521
522    /// Consumes the WTF-8 string and tries to convert it to a vec of bytes.
523    #[inline]
524    pub fn into_bytes(self) -> Vec<u8> {
525        self.bytes
526    }
527
528    /// Consumes the WTF-8 string and tries to convert it to UTF-8.
529    ///
530    /// This does not copy the data.
531    ///
532    /// If the contents are not well-formed UTF-8
533    /// (that is, if the string contains surrogates),
534    /// the original WTF-8 string is returned instead.
535    pub fn into_string(self) -> Result<String, Wtf8Buf> {
536        if self.is_utf8() {
537            Ok(unsafe { String::from_utf8_unchecked(self.bytes) })
538        } else {
539            Err(self)
540        }
541    }
542
543    /// Consumes the WTF-8 string and converts it lossily to UTF-8.
544    ///
545    /// This does not copy the data (but may overwrite parts of it in place).
546    ///
547    /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”)
548    pub fn into_string_lossy(mut self) -> String {
549        let mut pos = 0;
550        while let Some((surrogate_pos, _)) = self.next_surrogate(pos) {
551            pos = surrogate_pos + 3;
552            // Surrogates and the replacement character are all 3 bytes, so
553            // they can substituted in-place.
554            self.bytes[surrogate_pos..pos].copy_from_slice(UTF8_REPLACEMENT_CHARACTER.as_bytes());
555        }
556        unsafe { String::from_utf8_unchecked(self.bytes) }
557    }
558
559    /// Converts this `Wtf8Buf` into a boxed `Wtf8`.
560    #[inline]
561    pub fn into_box(self) -> Box<Wtf8> {
562        // SAFETY: relies on `Wtf8` being `repr(transparent)`.
563        unsafe { mem::transmute(self.bytes.into_boxed_slice()) }
564    }
565
566    /// Converts a `Box<Wtf8>` into a `Wtf8Buf`.
567    pub fn from_box(boxed: Box<Wtf8>) -> Wtf8Buf {
568        let bytes: Box<[u8]> = unsafe { mem::transmute(boxed) };
569        Wtf8Buf {
570            bytes: bytes.into_vec(),
571        }
572    }
573}
574
575/// Creates a new WTF-8 string from an iterator of code points.
576///
577/// This replaces surrogate code point pairs with supplementary code points,
578/// like concatenating ill-formed UTF-16 strings effectively would.
579impl FromIterator<CodePoint> for Wtf8Buf {
580    fn from_iter<T: IntoIterator<Item = CodePoint>>(iter: T) -> Wtf8Buf {
581        let mut string = Wtf8Buf::new();
582        string.extend(iter);
583        string
584    }
585}
586
587/// Append code points from an iterator to the string.
588///
589/// This replaces surrogate code point pairs with supplementary code points,
590/// like concatenating ill-formed UTF-16 strings effectively would.
591impl Extend<CodePoint> for Wtf8Buf {
592    fn extend<T: IntoIterator<Item = CodePoint>>(&mut self, iter: T) {
593        let iterator = iter.into_iter();
594        let (low, _high) = iterator.size_hint();
595        // Lower bound of one byte per code point (ASCII only)
596        self.bytes.reserve(low);
597        iterator.for_each(move |code_point| self.push(code_point));
598    }
599}
600
601impl Extend<char> for Wtf8Buf {
602    fn extend<T: IntoIterator<Item = char>>(&mut self, iter: T) {
603        self.extend(iter.into_iter().map(CodePoint::from))
604    }
605}
606
607impl<W: AsRef<Wtf8>> Extend<W> for Wtf8Buf {
608    fn extend<T: IntoIterator<Item = W>>(&mut self, iter: T) {
609        iter.into_iter()
610            .for_each(move |w| self.push_wtf8(w.as_ref()));
611    }
612}
613
614impl<W: AsRef<Wtf8>> FromIterator<W> for Wtf8Buf {
615    fn from_iter<T: IntoIterator<Item = W>>(iter: T) -> Self {
616        let mut buf = Wtf8Buf::new();
617        iter.into_iter().for_each(|w| buf.push_wtf8(w.as_ref()));
618        buf
619    }
620}
621
622impl Hash for Wtf8Buf {
623    fn hash<H: Hasher>(&self, state: &mut H) {
624        Wtf8::hash(self, state)
625    }
626}
627
628impl AsRef<Wtf8> for Wtf8Buf {
629    fn as_ref(&self) -> &Wtf8 {
630        self
631    }
632}
633
634impl From<String> for Wtf8Buf {
635    fn from(s: String) -> Self {
636        Wtf8Buf::from_string(s)
637    }
638}
639
640impl From<&str> for Wtf8Buf {
641    fn from(s: &str) -> Self {
642        Wtf8Buf::from_string(s.to_owned())
643    }
644}
645
646impl From<ascii::AsciiString> for Wtf8Buf {
647    fn from(s: ascii::AsciiString) -> Self {
648        Wtf8Buf::from_string(s.into())
649    }
650}
651
/// A borrowed slice of well-formed WTF-8 data.
///
/// Similar to `&str`, but can additionally contain surrogate code points.
/// As the crate-level documentation explains, this implementation also
/// permits a leading surrogate directly followed by a trailing one; the two
/// are kept as separate code points rather than being joined.
///
/// # Layout
///
/// The type is `repr(transparent)` over `[u8]`. Code in this file casts
/// `*const [u8]` to `*const Wtf8` and transmutes between `Box<[u8]>` and
/// `Box<Wtf8>`; both are only guaranteed to be sound when the two types share
/// a layout, which the default representation does not promise.
#[derive(PartialEq, Eq, PartialOrd, Ord)]
#[repr(transparent)]
pub struct Wtf8 {
    // The encoded contents; must always hold valid WTF-8.
    bytes: [u8],
}
660
661impl AsRef<Wtf8> for Wtf8 {
662    fn as_ref(&self) -> &Wtf8 {
663        self
664    }
665}
666
667impl ToOwned for Wtf8 {
668    type Owned = Wtf8Buf;
669
670    fn to_owned(&self) -> Self::Owned {
671        self.to_wtf8_buf()
672    }
673
674    fn clone_into(&self, buf: &mut Self::Owned) {
675        self.bytes.clone_into(&mut buf.bytes);
676    }
677}
678
679impl PartialEq<str> for Wtf8 {
680    fn eq(&self, other: &str) -> bool {
681        self.as_bytes().eq(other.as_bytes())
682    }
683}
684
685/// Formats the string in double quotes, with characters escaped according to
686/// [`char::escape_debug`] and unpaired surrogates represented as `\u{xxxx}`,
687/// where each `x` is a hexadecimal digit.
688impl fmt::Debug for Wtf8 {
689    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
690        fn write_str_escaped(f: &mut fmt::Formatter<'_>, s: &str) -> fmt::Result {
691            use core::fmt::Write;
692            for c in s.chars().flat_map(|c| c.escape_debug()) {
693                f.write_char(c)?
694            }
695            Ok(())
696        }
697
698        formatter.write_str("\"")?;
699        let mut pos = 0;
700        while let Some((surrogate_pos, surrogate)) = self.next_surrogate(pos) {
701            write_str_escaped(formatter, unsafe {
702                str::from_utf8_unchecked(&self.bytes[pos..surrogate_pos])
703            })?;
704            write!(formatter, "\\u{{{surrogate:x}}}")?;
705            pos = surrogate_pos + 3;
706        }
707        write_str_escaped(formatter, unsafe {
708            str::from_utf8_unchecked(&self.bytes[pos..])
709        })?;
710        formatter.write_str("\"")
711    }
712}
713
714/// Formats the string with unpaired surrogates substituted with the replacement
715/// character, U+FFFD.
716impl fmt::Display for Wtf8 {
717    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
718        let wtf8_bytes = &self.bytes;
719        let mut pos = 0;
720        loop {
721            match self.next_surrogate(pos) {
722                Some((surrogate_pos, _)) => {
723                    formatter.write_str(unsafe {
724                        str::from_utf8_unchecked(&wtf8_bytes[pos..surrogate_pos])
725                    })?;
726                    formatter.write_str(UTF8_REPLACEMENT_CHARACTER)?;
727                    pos = surrogate_pos + 3;
728                }
729                None => {
730                    let s = unsafe { str::from_utf8_unchecked(&wtf8_bytes[pos..]) };
731                    if pos == 0 {
732                        return s.fmt(formatter);
733                    } else {
734                        return formatter.write_str(s);
735                    }
736                }
737            }
738        }
739    }
740}
741
742impl Default for &Wtf8 {
743    fn default() -> Self {
744        unsafe { Wtf8::from_bytes_unchecked(&[]) }
745    }
746}
747
748impl Hash for Wtf8 {
749    fn hash<H: Hasher>(&self, state: &mut H) {
750        state.write(self.as_bytes());
751        state.write_u8(0xff);
752    }
753}
754
755impl Wtf8 {
756    /// Creates a WTF-8 slice from a UTF-8 `&str` slice.
757    ///
758    /// Since WTF-8 is a superset of UTF-8, this always succeeds.
759    #[inline]
760    pub fn new<S: AsRef<Wtf8> + ?Sized>(value: &S) -> &Wtf8 {
761        value.as_ref()
762    }
763
764    /// Creates a WTF-8 slice from a WTF-8 byte slice.
765    ///
766    /// # Safety
767    ///
768    /// `value` must contain valid WTF-8.
769    #[inline]
770    pub const unsafe fn from_bytes_unchecked(value: &[u8]) -> &Wtf8 {
771        // SAFETY: start with &[u8], end with fancy &[u8]
772        unsafe { &*(value as *const [u8] as *const Wtf8) }
773    }
774
775    /// Creates a mutable WTF-8 slice from a mutable WTF-8 byte slice.
776    ///
777    /// Since the byte slice is not checked for valid WTF-8, this functions is
778    /// marked unsafe.
779    #[inline]
780    const unsafe fn from_mut_bytes_unchecked(value: &mut [u8]) -> &mut Wtf8 {
781        // SAFETY: start with &mut [u8], end with fancy &mut [u8]
782        unsafe { &mut *(value as *mut [u8] as *mut Wtf8) }
783    }
784
785    /// Create a WTF-8 slice from a WTF-8 byte slice.
786    //
787    // whoops! using WTF-8 for interchange!
788    #[inline]
789    pub fn from_bytes(b: &[u8]) -> Option<&Self> {
790        let mut rest = b;
791        while let Err(e) = core::str::from_utf8(rest) {
792            rest = &rest[e.valid_up_to()..];
793            let _ = Self::decode_surrogate(rest)?;
794            rest = &rest[3..];
795        }
796        Some(unsafe { Wtf8::from_bytes_unchecked(b) })
797    }
798
799    fn decode_surrogate(b: &[u8]) -> Option<CodePoint> {
800        let [0xed, b2 @ (0xa0..), b3, ..] = *b else {
801            return None;
802        };
803        Some(decode_surrogate(b2, b3).into())
804    }
805
806    /// Returns the length, in WTF-8 bytes.
807    #[inline]
808    pub const fn len(&self) -> usize {
809        self.bytes.len()
810    }
811
812    #[inline]
813    pub const fn is_empty(&self) -> bool {
814        self.bytes.is_empty()
815    }
816
817    /// Returns the code point at `position` if it is in the ASCII range,
818    /// or `b'\xFF'` otherwise.
819    ///
820    /// # Panics
821    ///
822    /// Panics if `position` is beyond the end of the string.
823    #[inline]
824    pub const fn ascii_byte_at(&self, position: usize) -> u8 {
825        match self.bytes[position] {
826            ascii_byte @ 0x00..=0x7F => ascii_byte,
827            _ => 0xFF,
828        }
829    }
830
831    /// Returns an iterator for the string’s code points.
832    #[inline]
833    pub fn code_points(&self) -> Wtf8CodePoints<'_> {
834        Wtf8CodePoints {
835            bytes: self.bytes.iter(),
836        }
837    }
838
839    /// Returns an iterator for the string’s code points and their indices.
840    #[inline]
841    pub fn code_point_indices(&self) -> Wtf8CodePointIndices<'_> {
842        Wtf8CodePointIndices {
843            front_offset: 0,
844            iter: self.code_points(),
845        }
846    }
847
848    /// Access raw bytes of WTF-8 data
849    #[inline]
850    pub const fn as_bytes(&self) -> &[u8] {
851        &self.bytes
852    }
853
854    /// Tries to convert the string to UTF-8 and return a `&str` slice.
855    ///
856    /// Returns `None` if the string contains surrogates.
857    ///
858    /// This does not copy the data.
859    #[inline]
860    pub const fn as_str(&self) -> Result<&str, str::Utf8Error> {
861        str::from_utf8(&self.bytes)
862    }
863
864    /// Creates an owned `Wtf8Buf` from a borrowed `Wtf8`.
865    pub fn to_wtf8_buf(&self) -> Wtf8Buf {
866        Wtf8Buf {
867            bytes: self.bytes.to_vec(),
868        }
869    }
870
871    /// Lossily converts the string to UTF-8.
872    /// Returns a UTF-8 `&str` slice if the contents are well-formed in UTF-8.
873    ///
874    /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”).
875    ///
876    /// This only copies the data if necessary (if it contains any surrogate).
877    pub fn to_string_lossy(&self) -> Cow<'_, str> {
878        let Some((surrogate_pos, _)) = self.next_surrogate(0) else {
879            return Cow::Borrowed(unsafe { str::from_utf8_unchecked(&self.bytes) });
880        };
881        let wtf8_bytes = &self.bytes;
882        let mut utf8_bytes = Vec::with_capacity(self.len());
883        utf8_bytes.extend_from_slice(&wtf8_bytes[..surrogate_pos]);
884        utf8_bytes.extend_from_slice(UTF8_REPLACEMENT_CHARACTER.as_bytes());
885        let mut pos = surrogate_pos + 3;
886        loop {
887            match self.next_surrogate(pos) {
888                Some((surrogate_pos, _)) => {
889                    utf8_bytes.extend_from_slice(&wtf8_bytes[pos..surrogate_pos]);
890                    utf8_bytes.extend_from_slice(UTF8_REPLACEMENT_CHARACTER.as_bytes());
891                    pos = surrogate_pos + 3;
892                }
893                None => {
894                    utf8_bytes.extend_from_slice(&wtf8_bytes[pos..]);
895                    return Cow::Owned(unsafe { String::from_utf8_unchecked(utf8_bytes) });
896                }
897            }
898        }
899    }
900
901    /// Converts the WTF-8 string to potentially ill-formed UTF-16
902    /// and return an iterator of 16-bit code units.
903    ///
904    /// This is lossless:
905    /// calling `Wtf8Buf::from_ill_formed_utf16` on the resulting code units
906    /// would always return the original WTF-8 string.
907    #[inline]
908    pub fn encode_wide(&self) -> EncodeWide<'_> {
909        EncodeWide {
910            code_points: self.code_points(),
911            extra: 0,
912        }
913    }
914
915    pub const fn chunks(&self) -> Wtf8Chunks<'_> {
916        Wtf8Chunks { wtf8: self }
917    }
918
919    pub fn map_utf8<'a, I>(&'a self, f: impl Fn(&'a str) -> I) -> impl Iterator<Item = CodePoint>
920    where
921        I: Iterator<Item = char>,
922    {
923        self.chunks().flat_map(move |chunk| match chunk {
924            Wtf8Chunk::Utf8(s) => Either::Left(f(s).map_into()),
925            Wtf8Chunk::Surrogate(c) => Either::Right(core::iter::once(c)),
926        })
927    }
928
929    #[inline]
930    fn next_surrogate(&self, mut pos: usize) -> Option<(usize, u16)> {
931        let mut iter = self.bytes[pos..].iter();
932        loop {
933            let b = *iter.next()?;
934            if b < 0x80 {
935                pos += 1;
936            } else if b < 0xE0 {
937                iter.next();
938                pos += 2;
939            } else if b == 0xED {
940                match (iter.next(), iter.next()) {
941                    (Some(&b2), Some(&b3)) if b2 >= 0xA0 => {
942                        return Some((pos, decode_surrogate(b2, b3)));
943                    }
944                    _ => pos += 3,
945                }
946            } else if b < 0xF0 {
947                iter.next();
948                iter.next();
949                pos += 3;
950            } else {
951                iter.next();
952                iter.next();
953                iter.next();
954                pos += 4;
955            }
956        }
957    }
958
    /// Returns `true` if byte offset `index` is the start of a code point or
    /// the end of the string; offsets past the end yield `false`.
    pub fn is_code_point_boundary(&self, index: usize) -> bool {
        is_code_point_boundary(self, index)
    }
962
963    /// Boxes this `Wtf8`.
964    #[inline]
965    pub fn into_box(&self) -> Box<Wtf8> {
966        let boxed: Box<[u8]> = self.bytes.into();
967        unsafe { mem::transmute(boxed) }
968    }
969
970    /// Creates a boxed, empty `Wtf8`.
971    pub fn empty_box() -> Box<Wtf8> {
972        let boxed: Box<[u8]> = Default::default();
973        unsafe { mem::transmute(boxed) }
974    }
975
    /// Converts the ASCII letters of this string to lowercase in place by
    /// delegating to the byte slice's `make_ascii_lowercase`.
    #[inline]
    pub fn make_ascii_lowercase(&mut self) {
        self.bytes.make_ascii_lowercase()
    }
980
    /// Converts the ASCII letters of this string to uppercase in place by
    /// delegating to the byte slice's `make_ascii_uppercase`.
    #[inline]
    pub fn make_ascii_uppercase(&mut self) {
        self.bytes.make_ascii_uppercase()
    }
985
986    #[inline]
987    pub fn to_ascii_lowercase(&self) -> Wtf8Buf {
988        Wtf8Buf {
989            bytes: self.bytes.to_ascii_lowercase(),
990        }
991    }
992
993    #[inline]
994    pub fn to_ascii_uppercase(&self) -> Wtf8Buf {
995        Wtf8Buf {
996            bytes: self.bytes.to_ascii_uppercase(),
997        }
998    }
999
1000    pub fn to_lowercase(&self) -> Wtf8Buf {
1001        let mut buf = Wtf8Buf::with_capacity(self.len());
1002        for chunk in self.chunks() {
1003            match chunk {
1004                Wtf8Chunk::Utf8(s) => buf.push_str(&s.to_lowercase()),
1005                Wtf8Chunk::Surrogate(c) => buf.push(c),
1006            }
1007        }
1008        buf
1009    }
1010
1011    pub fn to_uppercase(&self) -> Wtf8Buf {
1012        let mut buf = Wtf8Buf::with_capacity(self.len());
1013        for chunk in self.chunks() {
1014            match chunk {
1015                Wtf8Chunk::Utf8(s) => buf.push_str(&s.to_uppercase()),
1016                Wtf8Chunk::Surrogate(c) => buf.push(c),
1017            }
1018        }
1019        buf
1020    }
1021
    /// Returns `true` if every byte of this string is ASCII.
    #[inline]
    pub const fn is_ascii(&self) -> bool {
        self.bytes.is_ascii()
    }
1026
    /// Returns `true` if this string contains no surrogate code points,
    /// i.e. scanning from the start finds no surrogate.
    #[inline]
    pub fn is_utf8(&self) -> bool {
        self.next_surrogate(0).is_none()
    }
1031
    /// Compares the bytes of two strings, ignoring differences in ASCII
    /// letter case only.
    #[inline]
    pub fn eq_ignore_ascii_case(&self, other: &Self) -> bool {
        self.bytes.eq_ignore_ascii_case(&other.bytes)
    }
1036
1037    pub fn split(&self, pat: &Wtf8) -> impl Iterator<Item = &Self> {
1038        self.as_bytes()
1039            .split_str(pat)
1040            .map(|w| unsafe { Wtf8::from_bytes_unchecked(w) })
1041    }
1042
1043    pub fn splitn(&self, n: usize, pat: &Wtf8) -> impl Iterator<Item = &Self> {
1044        self.as_bytes()
1045            .splitn_str(n, pat)
1046            .map(|w| unsafe { Wtf8::from_bytes_unchecked(w) })
1047    }
1048
1049    pub fn rsplit(&self, pat: &Wtf8) -> impl Iterator<Item = &Self> {
1050        self.as_bytes()
1051            .rsplit_str(pat)
1052            .map(|w| unsafe { Wtf8::from_bytes_unchecked(w) })
1053    }
1054
1055    pub fn rsplitn(&self, n: usize, pat: &Wtf8) -> impl Iterator<Item = &Self> {
1056        self.as_bytes()
1057            .rsplitn_str(n, pat)
1058            .map(|w| unsafe { Wtf8::from_bytes_unchecked(w) })
1059    }
1060
1061    pub fn trim(&self) -> &Self {
1062        let w = self.bytes.trim();
1063        unsafe { Wtf8::from_bytes_unchecked(w) }
1064    }
1065
1066    pub fn trim_start(&self) -> &Self {
1067        let w = self.bytes.trim_start();
1068        unsafe { Wtf8::from_bytes_unchecked(w) }
1069    }
1070
1071    pub fn trim_end(&self) -> &Self {
1072        let w = self.bytes.trim_end();
1073        unsafe { Wtf8::from_bytes_unchecked(w) }
1074    }
1075
1076    pub fn trim_start_matches(&self, f: impl Fn(CodePoint) -> bool) -> &Self {
1077        let mut iter = self.code_points();
1078        loop {
1079            let old = iter.clone();
1080            match iter.next().map(&f) {
1081                Some(true) => continue,
1082                Some(false) => {
1083                    iter = old;
1084                    break;
1085                }
1086                None => return iter.as_wtf8(),
1087            }
1088        }
1089        iter.as_wtf8()
1090    }
1091
1092    pub fn trim_end_matches(&self, f: impl Fn(CodePoint) -> bool) -> &Self {
1093        let mut iter = self.code_points();
1094        loop {
1095            let old = iter.clone();
1096            match iter.next_back().map(&f) {
1097                Some(true) => continue,
1098                Some(false) => {
1099                    iter = old;
1100                    break;
1101                }
1102                None => return iter.as_wtf8(),
1103            }
1104        }
1105        iter.as_wtf8()
1106    }
1107
    /// Strips code points accepted by `f` from both ends: first from the
    /// start, then from the end of what remains.
    pub fn trim_matches(&self, f: impl Fn(CodePoint) -> bool) -> &Self {
        self.trim_start_matches(&f).trim_end_matches(&f)
    }
1111
    /// Returns the byte offset of the first occurrence of `pat`, or `None`
    /// if it does not occur. The search is a byte-level substring search.
    pub fn find(&self, pat: &Wtf8) -> Option<usize> {
        memchr::memmem::find(self.as_bytes(), pat.as_bytes())
    }
1115
    /// Returns the byte offset of the last occurrence of `pat`, or `None`
    /// if it does not occur. The search is a byte-level substring search.
    pub fn rfind(&self, pat: &Wtf8) -> Option<usize> {
        memchr::memmem::rfind(self.as_bytes(), pat.as_bytes())
    }
1119
    /// Returns an iterator over the byte offsets at which
    /// `memchr::memmem::find_iter` reports `pat`, searching front to back.
    pub fn find_iter(&self, pat: &Wtf8) -> impl Iterator<Item = usize> {
        memchr::memmem::find_iter(self.as_bytes(), pat.as_bytes())
    }
1123
    /// Returns an iterator over the byte offsets at which
    /// `memchr::memmem::rfind_iter` reports `pat`, searching back to front.
    pub fn rfind_iter(&self, pat: &Wtf8) -> impl Iterator<Item = usize> {
        memchr::memmem::rfind_iter(self.as_bytes(), pat.as_bytes())
    }
1127
    /// Returns `true` if `pat` occurs anywhere in this string (byte-level
    /// substring test).
    pub fn contains(&self, pat: &Wtf8) -> bool {
        self.bytes.contains_str(pat)
    }
1131
1132    pub fn contains_code_point(&self, pat: CodePoint) -> bool {
1133        self.bytes
1134            .contains_str(pat.encode_wtf8(&mut [0; MAX_LEN_UTF8]))
1135    }
1136
1137    pub fn get(&self, range: impl ops::RangeBounds<usize>) -> Option<&Self> {
1138        let start = match range.start_bound() {
1139            ops::Bound::Included(&i) => i,
1140            ops::Bound::Excluded(&i) => i.saturating_add(1),
1141            ops::Bound::Unbounded => 0,
1142        };
1143        let end = match range.end_bound() {
1144            ops::Bound::Included(&i) => i.saturating_add(1),
1145            ops::Bound::Excluded(&i) => i,
1146            ops::Bound::Unbounded => self.len(),
1147        };
1148        // is_code_point_boundary checks that the index is in [0, .len()]
1149        if start <= end && is_code_point_boundary(self, start) && is_code_point_boundary(self, end)
1150        {
1151            Some(unsafe { slice_unchecked(self, start, end) })
1152        } else {
1153            None
1154        }
1155    }
1156
    /// Returns `true` if the bytes of this string end with the bytes of `w`.
    pub fn ends_with(&self, w: impl AsRef<Wtf8>) -> bool {
        self.bytes.ends_with_str(w.as_ref())
    }
1160
    /// Returns `true` if the bytes of this string start with the bytes of `w`.
    pub fn starts_with(&self, w: impl AsRef<Wtf8>) -> bool {
        self.bytes.starts_with_str(w.as_ref())
    }
1164
1165    pub fn strip_prefix(&self, w: impl AsRef<Wtf8>) -> Option<&Self> {
1166        self.bytes
1167            .strip_prefix(w.as_ref().as_bytes())
1168            .map(|w| unsafe { Wtf8::from_bytes_unchecked(w) })
1169    }
1170
1171    pub fn strip_suffix(&self, w: impl AsRef<Wtf8>) -> Option<&Self> {
1172        self.bytes
1173            .strip_suffix(w.as_ref().as_bytes())
1174            .map(|w| unsafe { Wtf8::from_bytes_unchecked(w) })
1175    }
1176
1177    pub fn replace(&self, from: &Wtf8, to: &Wtf8) -> Wtf8Buf {
1178        let w = self.bytes.replace(from, to);
1179        unsafe { Wtf8Buf::from_bytes_unchecked(w) }
1180    }
1181
1182    pub fn replacen(&self, from: &Wtf8, to: &Wtf8, n: usize) -> Wtf8Buf {
1183        let w = self.bytes.replacen(from, to, n);
1184        unsafe { Wtf8Buf::from_bytes_unchecked(w) }
1185    }
1186}
1187
/// Views a `str` as a `Wtf8` without copying.
impl AsRef<Wtf8> for str {
    fn as_ref(&self) -> &Wtf8 {
        // SAFETY: valid UTF-8 is always valid WTF-8.
        unsafe { Wtf8::from_bytes_unchecked(self.as_bytes()) }
    }
}
1193
/// Exposes the underlying bytes of a `Wtf8`.
impl AsRef<[u8]> for Wtf8 {
    fn as_ref(&self) -> &[u8] {
        self.as_bytes()
    }
}
1199
1200/// Returns a slice of the given string for the byte range \[`begin`..`end`).
1201///
1202/// # Panics
1203///
1204/// Panics when `begin` and `end` do not point to code point boundaries,
1205/// or point beyond the end of the string.
1206impl ops::Index<ops::Range<usize>> for Wtf8 {
1207    type Output = Wtf8;
1208
1209    #[inline]
1210    #[track_caller]
1211    fn index(&self, range: ops::Range<usize>) -> &Wtf8 {
1212        // is_code_point_boundary checks that the index is in [0, .len()]
1213        if range.start <= range.end
1214            && is_code_point_boundary(self, range.start)
1215            && is_code_point_boundary(self, range.end)
1216        {
1217            unsafe { slice_unchecked(self, range.start, range.end) }
1218        } else {
1219            slice_error_fail(self, range.start, range.end)
1220        }
1221    }
1222}
1223
1224/// Returns a slice of the given string from byte `begin` to its end.
1225///
1226/// # Panics
1227///
1228/// Panics when `begin` is not at a code point boundary,
1229/// or is beyond the end of the string.
1230impl ops::Index<ops::RangeFrom<usize>> for Wtf8 {
1231    type Output = Wtf8;
1232
1233    #[inline]
1234    #[track_caller]
1235    fn index(&self, range: ops::RangeFrom<usize>) -> &Wtf8 {
1236        // is_code_point_boundary checks that the index is in [0, .len()]
1237        if is_code_point_boundary(self, range.start) {
1238            unsafe { slice_unchecked(self, range.start, self.len()) }
1239        } else {
1240            slice_error_fail(self, range.start, self.len())
1241        }
1242    }
1243}
1244
1245/// Returns a slice of the given string from its beginning to byte `end`.
1246///
1247/// # Panics
1248///
1249/// Panics when `end` is not at a code point boundary,
1250/// or is beyond the end of the string.
1251impl ops::Index<ops::RangeTo<usize>> for Wtf8 {
1252    type Output = Wtf8;
1253
1254    #[inline]
1255    #[track_caller]
1256    fn index(&self, range: ops::RangeTo<usize>) -> &Wtf8 {
1257        // is_code_point_boundary checks that the index is in [0, .len()]
1258        if is_code_point_boundary(self, range.end) {
1259            unsafe { slice_unchecked(self, 0, range.end) }
1260        } else {
1261            slice_error_fail(self, 0, range.end)
1262        }
1263    }
1264}
1265
/// Indexing with `..` returns the whole string; it never panics.
impl ops::Index<ops::RangeFull> for Wtf8 {
    type Output = Wtf8;

    #[inline]
    fn index(&self, _range: ops::RangeFull) -> &Wtf8 {
        self
    }
}
1274
/// Decodes the 16-bit surrogate value from the second and third bytes of
/// its three-byte WTF-8 encoding.
///
/// The first byte is assumed to be 0xED; only the low six bits of each
/// argument contribute to the result.
#[inline]
const fn decode_surrogate(second_byte: u8, third_byte: u8) -> u16 {
    let upper_six = ((second_byte & 0x3F) as u16) << 6;
    let lower_six = (third_byte & 0x3F) as u16;
    0xD800 | upper_six | lower_six
}
1280
/// Combines a lead and a trail surrogate into the supplementary-plane
/// `char` they encode in UTF-16.
///
/// `lead` must be in `0xD800..=0xDBFF` and `trail` in `0xDC00..=0xDFFF`;
/// the result is built without validation.
#[inline]
const fn decode_surrogate_pair(lead: u16, trail: u16) -> char {
    let high_ten = (lead - 0xD800) as u32;
    let low_ten = (trail - 0xDC00) as u32;
    let scalar = 0x10000 + ((high_ten << 10) | low_ten);
    // SAFETY: for a genuine lead/trail pair `scalar` lies in
    // 0x10000..=0x10FFFF, which is a valid `char`.
    unsafe { char::from_u32_unchecked(scalar) }
}
1286
1287/// Copied from str::is_char_boundary
1288#[inline]
1289fn is_code_point_boundary(slice: &Wtf8, index: usize) -> bool {
1290    if index == 0 {
1291        return true;
1292    }
1293    match slice.bytes.get(index) {
1294        None => index == slice.len(),
1295        Some(&b) => (b as i8) >= -0x40,
1296    }
1297}
1298
1299/// Verify that `index` is at the edge of either a valid UTF-8 codepoint
1300/// (i.e. a codepoint that's not a surrogate) or of the whole string.
1301///
1302/// These are the cases currently permitted by `OsStr::slice_encoded_bytes`.
1303/// Splitting between surrogates is valid as far as WTF-8 is concerned, but
1304/// we do not permit it in the public API because WTF-8 is considered an
1305/// implementation detail.
1306#[track_caller]
1307#[inline]
1308pub fn check_utf8_boundary(slice: &Wtf8, index: usize) {
1309    if index == 0 {
1310        return;
1311    }
1312    match slice.bytes.get(index) {
1313        Some(0xED) => (), // Might be a surrogate
1314        Some(&b) if (b as i8) >= -0x40 => return,
1315        Some(_) => panic!("byte index {index} is not a codepoint boundary"),
1316        None if index == slice.len() => return,
1317        None => panic!("byte index {index} is out of bounds"),
1318    }
1319    if slice.bytes[index + 1] >= 0xA0 {
1320        // There's a surrogate after index. Now check before index.
1321        if index >= 3 && slice.bytes[index - 3] == 0xED && slice.bytes[index - 2] >= 0xA0 {
1322            panic!("byte index {index} lies between surrogate codepoints");
1323        }
1324    }
1325}
1326
1327/// Copied from core::str::raw::slice_unchecked
1328///
1329/// # Safety
1330///
1331/// `begin` and `end` must be within bounds and on codepoint boundaries.
1332#[inline]
1333pub const unsafe fn slice_unchecked(s: &Wtf8, begin: usize, end: usize) -> &Wtf8 {
1334    // SAFETY: memory layout of a &[u8] and &Wtf8 are the same
1335    unsafe {
1336        let len = end - begin;
1337        let start = s.as_bytes().as_ptr().add(begin);
1338        Wtf8::from_bytes_unchecked(slice::from_raw_parts(start, len))
1339    }
1340}
1341
/// Copied from core::str::raw::slice_error_fail
///
/// Always panics: reports a failed slicing attempt on `s` for the byte
/// range `begin..end`. A reversed range trips the assertion first; any
/// other failure is reported as a boundary error.
#[inline(never)]
#[track_caller]
pub fn slice_error_fail(s: &Wtf8, begin: usize, end: usize) -> ! {
    // A reversed range gets its own (assertion) message.
    assert!(begin <= end);
    panic!("index {begin} and/or {end} in `{s:?}` do not lie on character boundary");
}
1349
/// Iterator for the code points of a WTF-8 string.
///
/// Created with the method `.code_points()`.
#[derive(Clone)]
pub struct Wtf8CodePoints<'a> {
    // Bytes of the string that have not been yielded yet (from either end).
    bytes: slice::Iter<'a, u8>,
}
1357
1358impl Iterator for Wtf8CodePoints<'_> {
1359    type Item = CodePoint;
1360
1361    #[inline]
1362    fn next(&mut self) -> Option<CodePoint> {
1363        // SAFETY: `self.bytes` has been created from a WTF-8 string
1364        unsafe { next_code_point(&mut self.bytes).map(|c| CodePoint { value: c }) }
1365    }
1366
1367    #[inline]
1368    fn size_hint(&self) -> (usize, Option<usize>) {
1369        let len = self.bytes.len();
1370        (len.saturating_add(3) / 4, Some(len))
1371    }
1372
1373    fn last(mut self) -> Option<Self::Item> {
1374        self.next_back()
1375    }
1376
1377    fn count(self) -> usize {
1378        core_str_count::count_chars(self.as_wtf8())
1379    }
1380}
1381
1382impl DoubleEndedIterator for Wtf8CodePoints<'_> {
1383    #[inline]
1384    fn next_back(&mut self) -> Option<CodePoint> {
1385        // SAFETY: `str` invariant says `self.iter` is a valid WTF-8 string and
1386        // the resulting `ch` is a valid Unicode Code Point.
1387        unsafe {
1388            next_code_point_reverse(&mut self.bytes).map(|ch| CodePoint::from_u32_unchecked(ch))
1389        }
1390    }
1391}
1392
1393impl<'a> Wtf8CodePoints<'a> {
1394    pub fn as_wtf8(&self) -> &'a Wtf8 {
1395        unsafe { Wtf8::from_bytes_unchecked(self.bytes.as_slice()) }
1396    }
1397}
1398
/// Iterator over the code points of a WTF-8 string paired with the byte
/// offset at which each one starts.
#[derive(Clone)]
pub struct Wtf8CodePointIndices<'a> {
    // Byte offset of the next code point that `next` will yield.
    front_offset: usize,
    // Underlying code point iterator over the not-yet-yielded bytes.
    iter: Wtf8CodePoints<'a>,
}
1404
1405impl Iterator for Wtf8CodePointIndices<'_> {
1406    type Item = (usize, CodePoint);
1407
1408    #[inline]
1409    fn next(&mut self) -> Option<(usize, CodePoint)> {
1410        let pre_len = self.iter.bytes.len();
1411        match self.iter.next() {
1412            None => None,
1413            Some(ch) => {
1414                let index = self.front_offset;
1415                let len = self.iter.bytes.len();
1416                self.front_offset += pre_len - len;
1417                Some((index, ch))
1418            }
1419        }
1420    }
1421
1422    #[inline]
1423    fn size_hint(&self) -> (usize, Option<usize>) {
1424        self.iter.size_hint()
1425    }
1426
1427    #[inline]
1428    fn last(mut self) -> Option<(usize, CodePoint)> {
1429        // No need to go through the entire string.
1430        self.next_back()
1431    }
1432
1433    #[inline]
1434    fn count(self) -> usize {
1435        self.iter.count()
1436    }
1437}
1438
1439impl DoubleEndedIterator for Wtf8CodePointIndices<'_> {
1440    #[inline]
1441    fn next_back(&mut self) -> Option<(usize, CodePoint)> {
1442        self.iter.next_back().map(|ch| {
1443            let index = self.front_offset + self.iter.bytes.len();
1444            (index, ch)
1445        })
1446    }
1447}
1448
1449impl FusedIterator for Wtf8CodePointIndices<'_> {}
1450
/// Generates a wide character sequence for potentially ill-formed UTF-16.
#[derive(Clone)]
pub struct EncodeWide<'a> {
    // Source of the code points still to be encoded.
    code_points: Wtf8CodePoints<'a>,
    // Second UTF-16 unit of the code point encoded last, waiting to be
    // yielded; 0 means nothing is pending.
    extra: u16,
}
1457
1458// Copied from libunicode/u_str.rs
1459impl Iterator for EncodeWide<'_> {
1460    type Item = u16;
1461
1462    #[inline]
1463    fn next(&mut self) -> Option<u16> {
1464        if self.extra != 0 {
1465            let tmp = self.extra;
1466            self.extra = 0;
1467            return Some(tmp);
1468        }
1469
1470        let mut buf = [0; MAX_LEN_UTF16];
1471        self.code_points.next().map(|code_point| {
1472            let n = encode_utf16_raw(code_point.value, &mut buf).len();
1473            if n == 2 {
1474                self.extra = buf[1];
1475            }
1476            buf[0]
1477        })
1478    }
1479
1480    #[inline]
1481    fn size_hint(&self) -> (usize, Option<usize>) {
1482        let (low, high) = self.code_points.size_hint();
1483        let ext = (self.extra != 0) as usize;
1484        // every code point gets either one u16 or two u16,
1485        // so this iterator is between 1 or 2 times as
1486        // long as the underlying iterator.
1487        (
1488            low + ext,
1489            high.and_then(|n| n.checked_mul(2))
1490                .and_then(|n| n.checked_add(ext)),
1491        )
1492    }
1493}
1494
1495impl FusedIterator for EncodeWide<'_> {}
1496
/// Iterator that splits a WTF-8 string into [`Wtf8Chunk`]s.
///
/// Created with the method `.chunks()`.
pub struct Wtf8Chunks<'a> {
    // The part of the string that has not been split into chunks yet.
    wtf8: &'a Wtf8,
}
1500
1501impl<'a> Iterator for Wtf8Chunks<'a> {
1502    type Item = Wtf8Chunk<'a>;
1503
1504    fn next(&mut self) -> Option<Self::Item> {
1505        match self.wtf8.next_surrogate(0) {
1506            Some((0, surrogate)) => {
1507                self.wtf8 = &self.wtf8[3..];
1508                Some(Wtf8Chunk::Surrogate(surrogate.into()))
1509            }
1510            Some((n, _)) => {
1511                let s = unsafe { str::from_utf8_unchecked(&self.wtf8.as_bytes()[..n]) };
1512                self.wtf8 = &self.wtf8[n..];
1513                Some(Wtf8Chunk::Utf8(s))
1514            }
1515            None => {
1516                let s =
1517                    unsafe { str::from_utf8_unchecked(core::mem::take(&mut self.wtf8).as_bytes()) };
1518                (!s.is_empty()).then_some(Wtf8Chunk::Utf8(s))
1519            }
1520        }
1521    }
1522}
1523
/// One piece of a WTF-8 string as produced by [`Wtf8Chunks`].
pub enum Wtf8Chunk<'a> {
    /// A run of the string that is valid UTF-8 (contains no surrogates).
    Utf8(&'a str),
    /// A single surrogate code point.
    Surrogate(CodePoint),
}
1528
/// Hashes a `CodePoint` by its numeric value alone.
impl Hash for CodePoint {
    #[inline]
    fn hash<H: Hasher>(&self, state: &mut H) {
        self.value.hash(state)
    }
}
1535
1536// == BOX IMPLS ==
1537
1538/// # Safety
1539///
1540/// `value` must be valid WTF-8.
1541pub unsafe fn from_boxed_wtf8_unchecked(value: Box<[u8]>) -> Box<Wtf8> {
1542    unsafe { Box::from_raw(Box::into_raw(value) as *mut Wtf8) }
1543}
1544
1545impl Clone for Box<Wtf8> {
1546    fn clone(&self) -> Self {
1547        (&**self).into()
1548    }
1549}
1550
1551impl Default for Box<Wtf8> {
1552    fn default() -> Self {
1553        unsafe { from_boxed_wtf8_unchecked(Box::default()) }
1554    }
1555}
1556
/// Copies a borrowed `Wtf8` into a new boxed `Wtf8`.
impl From<&Wtf8> for Box<Wtf8> {
    fn from(w: &Wtf8) -> Self {
        w.into_box()
    }
}
1562
/// Views a `&str` as a `&Wtf8` without copying.
impl<'a> From<&'a str> for &'a Wtf8 {
    #[inline]
    fn from(s: &'a str) -> &'a Wtf8 {
        // SAFETY: Valid UTF-8 is always valid WTF-8
        unsafe { Wtf8::from_bytes_unchecked(s.as_bytes()) }
    }
}
1570
1571impl From<&str> for Box<Wtf8> {
1572    fn from(s: &str) -> Self {
1573        Box::<str>::from(s).into()
1574    }
1575}
1576
1577impl From<Box<str>> for Box<Wtf8> {
1578    fn from(s: Box<str>) -> Self {
1579        unsafe { from_boxed_wtf8_unchecked(s.into_boxed_bytes()) }
1580    }
1581}
1582
1583impl From<Box<ascii::AsciiStr>> for Box<Wtf8> {
1584    fn from(s: Box<ascii::AsciiStr>) -> Self {
1585        <Box<str>>::from(s).into()
1586    }
1587}
1588
1589impl From<Box<Wtf8>> for Box<[u8]> {
1590    fn from(w: Box<Wtf8>) -> Self {
1591        unsafe { Box::from_raw(Box::into_raw(w) as *mut [u8]) }
1592    }
1593}
1594
/// Converts an owned `Wtf8Buf` into a boxed `Wtf8` via `Wtf8Buf::into_box`.
impl From<Wtf8Buf> for Box<Wtf8> {
    fn from(w: Wtf8Buf) -> Self {
        w.into_box()
    }
}
1600
1601impl From<Box<Wtf8>> for Wtf8Buf {
1602    fn from(w: Box<Wtf8>) -> Self {
1603        Wtf8Buf::from_box(w)
1604    }
1605}
1606
1607impl From<String> for Box<Wtf8> {
1608    fn from(s: String) -> Self {
1609        s.into_boxed_str().into()
1610    }
1611}
1612
1613mod concat;
1614pub use concat::Wtf8Concat;
rustpython_wtf8/lib.rs

rustpython_wtf8/
lib.rs