wtf8_rs/wtf8buf/
mod.rs

1//! A WTF-8 dynamically sized, growable string.
2
3use crate::{decode_surrogate, decode_surrogate_pair, CodePoint, Wtf8};
4use alloc::borrow::ToOwned;
5use alloc::boxed::Box;
6use alloc::string::String;
7use alloc::vec::Vec;
8use core::borrow::{Borrow, BorrowMut};
9use core::convert::Infallible;
10use core::iter::FromIterator;
11use core::ops::{Deref, DerefMut};
12use core::str::FromStr;
13use core::{char, fmt};
14
15#[cfg(test)]
16mod tests;
17
/// A WTF-8 dynamically sized, growable string.
///
/// This is the owned counterpart of the borrowed [`Wtf8`] slice type and
/// dereferences to it. Equality and ordering are derived from the underlying
/// byte sequence.
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct Wtf8Buf {
    // Backing storage holding the WTF-8 encoded bytes of the string.
    bytes: Vec<u8>,
}
23
24impl Wtf8Buf {
25    #[inline]
26    pub(crate) fn from_bytes(x: Vec<u8>) -> Wtf8Buf {
27        Self { bytes: x }
28    }
29
30    /// Creates a new, empty WTF-8 string.
31    #[inline]
32    pub const fn new() -> Wtf8Buf {
33        Wtf8Buf { bytes: Vec::new() }
34    }
35
36    /// Creates a new, empty WTF-8 string with pre-allocated capacity for `capacity` bytes.
37    #[inline]
38    pub fn with_capacity(capacity: usize) -> Wtf8Buf {
39        Wtf8Buf {
40            bytes: Vec::with_capacity(capacity),
41        }
42    }
43
44    /// Creates a WTF-8 string from a UTF-8 `String`.
45    ///
46    /// This takes ownership of the `String` and does not copy.
47    ///
48    /// Since WTF-8 is a superset of UTF-8, this always succeeds.
49    #[inline]
50    pub fn from_string(string: String) -> Wtf8Buf {
51        Wtf8Buf {
52            bytes: string.into_bytes(),
53        }
54    }
55
56    /// Reserves capacity for at least `additional` more bytes to be inserted
57    /// in the given `Wtf8Buf`.
58    /// The collection may reserve more space to avoid frequent reallocations.
59    ///
60    /// # Panics
61    ///
62    /// Panics if the new capacity overflows `usize`.
63    #[inline]
64    pub fn reserve(&mut self, additional: usize) {
65        self.bytes.reserve(additional)
66    }
67
68    /// Reserves the minimum capacity for exactly `additional` more elements to
69    /// be inserted in the given `Wtf8Buf`. After calling `reserve_exact`,
70    /// capacity will be greater than or equal to `self.len() + additional`.
71    /// Does nothing if the capacity is already sufficient.
72    ///
73    /// Note that the allocator may give the collection more space than it
74    /// requests. Therefore, capacity can not be relied upon to be precisely
75    /// minimal. Prefer `reserve` if future insertions are expected.
76    ///
77    /// # Panics
78    ///
79    /// Panics if the new capacity overflows `usize`.
80    #[inline]
81    pub fn reserve_exact(&mut self, additional: usize) {
82        self.bytes.reserve_exact(additional)
83    }
84
85    /// Shrinks the capacity of the vector as much as possible.
86    ///
87    /// It will drop down as close as possible to the length but the allocator
88    /// may still inform the vector that there is space for a few more elements.
89    #[inline]
90    pub fn shrink_to_fit(&mut self) {
91        self.bytes.shrink_to_fit()
92    }
93
94    /// Returns the number of bytes that this string buffer can hold without reallocating.
95    #[inline]
96    pub fn capacity(&self) -> usize {
97        self.bytes.capacity()
98    }
99
100    /// Creates a WTF-8 string from a UTF-8 `&str` slice.
101    ///
102    /// This copies the content of the slice.
103    ///
104    /// Since WTF-8 is a superset of UTF-8, this always succeeds.
105    #[inline]
106    #[allow(clippy::should_implement_trait)]
107    pub fn from_str(str: &str) -> Wtf8Buf {
108        Wtf8Buf {
109            bytes: <[_]>::to_vec(str.as_bytes()),
110        }
111    }
112
113    /// Clears the string.
114    #[inline]
115    pub fn clear(&mut self) {
116        self.bytes.clear()
117    }
118
119    /// Creates a WTF-8 string from a potentially ill-formed UTF-16 iterator of 16-bit code units.
120    ///
121    /// This is lossless: calling `.encode_utf16()` on the resulting string
122    /// will always return the original code units.
123    pub fn from_utf16<I>(v: I) -> Wtf8Buf
124    where
125        I: IntoIterator<Item = u16>,
126    {
127        let iter = v.into_iter();
128        let mut string = Wtf8Buf::with_capacity(iter.size_hint().0);
129        for item in char::decode_utf16(iter) {
130            match item {
131                Ok(ch) => string.push_char(ch),
132                Err(surrogate) => {
133                    let surrogate = surrogate.unpaired_surrogate();
134                    // Surrogates are known to be in the code point range.
135                    let code_point = unsafe { CodePoint::from_u32_unchecked(surrogate as u32) };
136                    // Skip the WTF-8 concatenation check,
137                    // surrogate pairs are already decoded by decode_utf16
138                    string.push_code_point_unchecked(code_point)
139                }
140            }
141        }
142        string
143    }
144
145    /// Returns the slice of this object.
146    #[inline]
147    pub fn as_wtf8(&self) -> &Wtf8 {
148        // Safety: Wtf8 is transparent, type layouts match.
149        unsafe { &*(self.bytes.as_slice() as *const [u8] as *const Wtf8) }
150    }
151
152    /// Returns the slice of this object.
153    #[inline]
154    pub fn as_mut_wtf8(&mut self) -> &mut Wtf8 {
155        // Safety: Wtf8 is transparent, type layouts match.
156        unsafe { &mut *(self.bytes.as_mut_slice() as *mut [u8] as *mut Wtf8) }
157    }
158
159    /// Append a UTF-8 slice at the end of the string.
160    #[inline]
161    pub fn push_str(&mut self, other: &str) {
162        self.bytes.extend_from_slice(other.as_bytes())
163    }
164
165    /// Append a string with WTF-8 encoding.
166    ///
167    /// This replaces newly paired surrogates at the boundary
168    /// with a supplementary code point,
169    /// like concatenating ill-formed UTF-16 strings effectively would.
170    #[inline]
171    pub fn push_wtf8(&mut self, other: &Wtf8) {
172        match (
173            (&*self).final_lead_surrogate(),
174            other.initial_trail_surrogate(),
175        ) {
176            // Replace newly paired surrogates by a supplementary code point.
177            (Some(lead), Some(trail)) => {
178                let len_without_lead_surrogate = self.len() - 3;
179                self.bytes.truncate(len_without_lead_surrogate);
180                let other_without_trail_surrogate = &other.bytes()[3..];
181                // 4 bytes for the supplementary code point
182                self.bytes.reserve(4 + other_without_trail_surrogate.len());
183                self.push_char(decode_surrogate_pair(lead, trail));
184                self.bytes.extend_from_slice(other_without_trail_surrogate);
185            }
186            _ => self.bytes.extend_from_slice(other.bytes()),
187        }
188    }
189
190    /// Append a Unicode scalar value at the end of the string.
191    #[inline]
192    pub fn push_char(&mut self, c: char) {
193        self.push_code_point_unchecked(CodePoint::from_char(c))
194    }
195
196    /// Append a code point at the end of the string.
197    ///
198    /// This replaces newly paired surrogates at the boundary
199    /// with a supplementary code point,
200    /// like concatenating ill-formed UTF-16 strings effectively would.
201    #[inline]
202    pub fn push(&mut self, code_point: CodePoint) {
203        if let trail @ 0xDC00..=0xDFFF = code_point.to_u32() {
204            if let Some(lead) = self.final_lead_surrogate() {
205                let len_without_lead_surrogate = self.len() - 3;
206                self.bytes.truncate(len_without_lead_surrogate);
207                self.push_char(decode_surrogate_pair(lead, trail as u16));
208                return;
209            }
210        }
211
212        // No newly paired surrogates at the boundary.
213        self.push_code_point_unchecked(code_point)
214    }
215
216    /// Shortens a string to the specified length.
217    ///
218    /// # Panics
219    ///
220    /// Panics if `new_len` > current length,
221    /// or if `new_len` is not a code point boundary.
222    #[inline]
223    pub fn truncate(&mut self, new_len: usize) {
224        assert!(self.is_code_point_boundary(new_len));
225        self.bytes.truncate(new_len)
226    }
227
228    /// Consumes the WTF-8 string and tries to convert it to UTF-8.
229    ///
230    /// This does not copy the data.
231    ///
232    /// If the contents are not well-formed UTF-8
233    /// (that is, if the string contains surrogates),
234    /// the original WTF-8 string is returned instead.
235    pub fn into_string(self) -> Result<String, IntoStringError> {
236        let chunks = self.chunks();
237
238        match chunks.next_surrogate() {
239            Some(position) => Err(IntoStringError {
240                wtf8: self,
241                valid_up_to: position,
242            }),
243            // Safety: No surrogates, so UTF-8 is guaranteed.
244            None => unsafe { Ok(String::from_utf8_unchecked(self.bytes)) },
245        }
246    }
247
248    /// Consumes the WTF-8 string and converts it lossily to UTF-8.
249    ///
250    /// This does not copy the data (but may overwrite parts of it in place).
251    ///
252    /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”)
253    pub fn into_string_lossy(self) -> String {
254        let chunks = self.chunks();
255
256        if chunks.next_surrogate().is_none() {
257            // Safety: No surrogates, so UTF-8 is guaranteed.
258            unsafe { String::from_utf8_unchecked(self.bytes) }
259        } else {
260            self.to_string_lossy().into_owned()
261        }
262    }
263
264    /// Converts this `Wtf8Buf` into a boxed `Wtf8`.
265    #[inline]
266    pub fn into_box(self) -> Box<Wtf8> {
267        // Safety: type layouts match.
268        unsafe { Box::from_raw(Box::into_raw(self.bytes.into_boxed_slice()) as *mut Wtf8) }
269    }
270
271    /// Converts a `Box<Wtf8>` into a `Wtf8Buf`.
272    pub fn from_box(boxed: Box<Wtf8>) -> Wtf8Buf {
273        // Safety: type layouts are the same.
274        let bytes: Box<[u8]> = unsafe { Box::from_raw(Box::into_raw(boxed) as *mut [u8]) };
275        Wtf8Buf {
276            bytes: bytes.into_vec(),
277        }
278    }
279
280    #[inline]
281    fn push_code_point_unchecked(&mut self, code_point: CodePoint) {
282        const TAG_CONT: u8 = 0b1000_0000;
283        const TAG_TWO_B: u8 = 0b1100_0000;
284        const TAG_THREE_B: u8 = 0b1110_0000;
285        const TAG_FOUR_B: u8 = 0b1111_0000;
286        const MAX_ONE_B: u32 = 0x80;
287        const MAX_TWO_B: u32 = 0x800;
288        const MAX_THREE_B: u32 = 0x10000;
289
290        #[inline]
291        const fn len_utf8(code: u32) -> usize {
292            if code < MAX_ONE_B {
293                1
294            } else if code < MAX_TWO_B {
295                2
296            } else if code < MAX_THREE_B {
297                3
298            } else {
299                4
300            }
301        }
302
303        #[inline]
304        fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> &mut [u8] {
305            let len = len_utf8(code);
306            #[allow(clippy::redundant_slicing)]
307            match (len, &mut dst[..]) {
308                (1, [a, ..]) => {
309                    *a = code as u8;
310                }
311                (2, [a, b, ..]) => {
312                    *a = (code >> 6 & 0x1F) as u8 | TAG_TWO_B;
313                    *b = (code & 0x3F) as u8 | TAG_CONT;
314                }
315                (3, [a, b, c, ..]) => {
316                    *a = (code >> 12 & 0x0F) as u8 | TAG_THREE_B;
317                    *b = (code >> 6 & 0x3F) as u8 | TAG_CONT;
318                    *c = (code & 0x3F) as u8 | TAG_CONT;
319                }
320                (4, [a, b, c, d, ..]) => {
321                    *a = (code >> 18 & 0x07) as u8 | TAG_FOUR_B;
322                    *b = (code >> 12 & 0x3F) as u8 | TAG_CONT;
323                    *c = (code >> 6 & 0x3F) as u8 | TAG_CONT;
324                    *d = (code & 0x3F) as u8 | TAG_CONT;
325                }
326                _ => panic!(
327                    "encode_utf8: need {} bytes to encode U+{:X}, but the buffer has {}",
328                    len,
329                    code,
330                    dst.len(),
331                ),
332            };
333            &mut dst[..len]
334        }
335
336        let mut bytes = [0; 4];
337        let bytes = encode_utf8_raw(code_point.to_u32(), &mut bytes);
338        self.bytes.extend_from_slice(bytes)
339    }
340
341    #[inline]
342    fn final_lead_surrogate(&self) -> Option<u16> {
343        match self.bytes() {
344            [.., 0xED, b2 @ 0xA0..=0xAF, b3] => Some(decode_surrogate(*b2, *b3)),
345            _ => None,
346        }
347    }
348}
349
350impl Deref for Wtf8Buf {
351    type Target = Wtf8;
352    #[inline]
353    fn deref(&self) -> &Wtf8 {
354        self.as_wtf8()
355    }
356}
357
358impl DerefMut for Wtf8Buf {
359    #[inline]
360    fn deref_mut(&mut self) -> &mut Wtf8 {
361        self.as_mut_wtf8()
362    }
363}
364
365impl From<String> for Wtf8Buf {
366    #[inline]
367    fn from(x: String) -> Wtf8Buf {
368        Wtf8Buf::from_string(x)
369    }
370}
371impl From<&str> for Wtf8Buf {
372    #[inline]
373    fn from(x: &str) -> Wtf8Buf {
374        Wtf8Buf::from_str(x)
375    }
376}
377impl From<&Wtf8> for Wtf8Buf {
378    #[inline]
379    fn from(x: &Wtf8) -> Wtf8Buf {
380        x.to_owned()
381    }
382}
383
384impl AsRef<Wtf8> for Wtf8Buf {
385    #[inline]
386    fn as_ref(&self) -> &Wtf8 {
387        self
388    }
389}
390impl Borrow<Wtf8> for Wtf8Buf {
391    #[inline]
392    fn borrow(&self) -> &Wtf8 {
393        self
394    }
395}
396impl AsMut<Wtf8> for Wtf8Buf {
397    #[inline]
398    fn as_mut(&mut self) -> &mut Wtf8 {
399        self
400    }
401}
402impl BorrowMut<Wtf8> for Wtf8Buf {
403    #[inline]
404    fn borrow_mut(&mut self) -> &mut Wtf8 {
405        self
406    }
407}
408
409impl FromStr for Wtf8Buf {
410    type Err = Infallible;
411
412    #[inline]
413    fn from_str(s: &str) -> Result<Self, Infallible> {
414        Ok(Wtf8Buf::from_str(s))
415    }
416}
417
418impl ToOwned for Wtf8 {
419    type Owned = Wtf8Buf;
420
421    #[inline]
422    fn to_owned(&self) -> Wtf8Buf {
423        Wtf8Buf {
424            bytes: self.bytes().to_owned(),
425        }
426    }
427}
428
429/// Creates a new WTF-8 string from an iterator of code points.
430///
431/// This replaces surrogate code point pairs with supplementary code points,
432/// like concatenating ill-formed UTF-16 strings effectively would.
433impl FromIterator<CodePoint> for Wtf8Buf {
434    fn from_iter<T: IntoIterator<Item = CodePoint>>(iter: T) -> Wtf8Buf {
435        let mut string = Wtf8Buf::new();
436        string.extend(iter);
437        string
438    }
439}
440
441impl FromIterator<char> for Wtf8Buf {
442    fn from_iter<T: IntoIterator<Item = char>>(iter: T) -> Wtf8Buf {
443        let mut string = Wtf8Buf::new();
444        string.extend(iter);
445        string
446    }
447}
448
449impl<'a> FromIterator<&'a Wtf8> for Wtf8Buf {
450    fn from_iter<T: IntoIterator<Item = &'a Wtf8>>(iter: T) -> Wtf8Buf {
451        let mut string = Wtf8Buf::new();
452        string.extend(iter);
453        string
454    }
455}
456
457impl<'a> FromIterator<&'a str> for Wtf8Buf {
458    fn from_iter<T: IntoIterator<Item = &'a str>>(iter: T) -> Wtf8Buf {
459        let mut string = Wtf8Buf::new();
460        string.extend(iter);
461        string
462    }
463}
464
465impl<'a> FromIterator<&'a CodePoint> for Wtf8Buf {
466    fn from_iter<T: IntoIterator<Item = &'a CodePoint>>(iter: T) -> Wtf8Buf {
467        let mut string = Wtf8Buf::new();
468        string.extend(iter);
469        string
470    }
471}
472
473impl<'a> FromIterator<&'a char> for Wtf8Buf {
474    fn from_iter<T: IntoIterator<Item = &'a char>>(iter: T) -> Wtf8Buf {
475        let mut string = Wtf8Buf::new();
476        string.extend(iter);
477        string
478    }
479}
480
481/// Append code points from an iterator to the string.
482///
483/// This replaces surrogate code point pairs with supplementary code points,
484/// like concatenating ill-formed UTF-16 strings effectively would.
485impl Extend<CodePoint> for Wtf8Buf {
486    fn extend<T: IntoIterator<Item = CodePoint>>(&mut self, iter: T) {
487        let iterator = iter.into_iter();
488        let (low, _high) = iterator.size_hint();
489        // Lower bound of one byte per code point (ASCII only)
490        self.bytes.reserve(low);
491        for code_point in iterator {
492            self.push(code_point);
493        }
494    }
495}
496
497impl Extend<char> for Wtf8Buf {
498    fn extend<T: IntoIterator<Item = char>>(&mut self, iter: T) {
499        let iterator = iter.into_iter();
500        let (low, _high) = iterator.size_hint();
501        self.bytes.reserve(low);
502        for c in iterator {
503            self.push_char(c);
504        }
505    }
506}
507
508impl<'a> Extend<&'a str> for Wtf8Buf {
509    fn extend<T: IntoIterator<Item = &'a str>>(&mut self, iter: T) {
510        let iterator = iter.into_iter();
511        let (low, _high) = iterator.size_hint();
512        self.bytes.reserve(low);
513        for c in iterator {
514            self.push_str(c);
515        }
516    }
517}
518
519impl<'a> Extend<&'a Wtf8> for Wtf8Buf {
520    fn extend<T: IntoIterator<Item = &'a Wtf8>>(&mut self, iter: T) {
521        let iterator = iter.into_iter();
522        let (low, _high) = iterator.size_hint();
523        self.bytes.reserve(low);
524        for c in iterator {
525            self.push_wtf8(c);
526        }
527    }
528}
529
530impl<'a> Extend<&'a CodePoint> for Wtf8Buf {
531    #[inline]
532    fn extend<T: IntoIterator<Item = &'a CodePoint>>(&mut self, iter: T) {
533        self.extend(iter.into_iter().copied())
534    }
535}
536
537impl<'a> Extend<&'a char> for Wtf8Buf {
538    #[inline]
539    fn extend<T: IntoIterator<Item = &'a char>>(&mut self, iter: T) {
540        self.extend(iter.into_iter().copied())
541    }
542}
543
544impl fmt::Debug for Wtf8Buf {
545    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
546        fmt::Debug::fmt(self.as_wtf8(), f)
547    }
548}
549
550impl fmt::Display for Wtf8Buf {
551    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
552        fmt::Display::fmt(self.as_wtf8(), f)
553    }
554}
555
/// Errors which can occur when converting `Wtf8Buf` to `String`.
///
/// Returned by `Wtf8Buf::into_string` when a surrogate is found in the
/// string; the unconverted string is handed back in [`wtf8`](Self::wtf8).
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
pub struct IntoStringError {
    /// The WTF-8 string whose conversion failed, returned to the caller
    /// so it is not lost.
    pub wtf8: Wtf8Buf,
    // Byte position of the first surrogate, as reported by the surrogate
    // search in `into_string`; exposed through `valid_up_to()`.
    valid_up_to: usize,
}
562impl IntoStringError {
563    /// Returns the index in the given string up to which valid UTF-8 was
564    /// verified.
565    ///
566    /// It is the maximum index such that `wstr[..index].to_str()` would
567    /// return `Ok(_)`.
568    #[inline]
569    pub fn valid_up_to(&self) -> usize {
570        self.valid_up_to
571    }
572
573    /// The length provided is that of the invalid byte sequence
574    /// that starts at the index given by `valid_up_to()`.
575    /// Decoding should resume after that sequence
576    /// (after inserting a [`U+FFFD REPLACEMENT CHARACTER`][U+FFFD]) in case
577    /// of lossy decoding.
578    ///
579    /// [U+FFFD]: ../../std/char/constant.REPLACEMENT_CHARACTER.html
580    #[inline]
581    pub fn error_len(&self) -> usize {
582        3
583    }
584}
585impl fmt::Display for IntoStringError {
586    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
587        write!(
588            f,
589            "invalid utf-8 sequence of 3 bytes from index {}",
590            self.valid_up_to
591        )
592    }
593}