pattern_3/omgwtf8/
wtf8.rs

1// Copyright 2015 The Rust Project Developers. See the COPYRIGHT
2// file at the top-level directory of this distribution and at
3// http://rust-lang.org/COPYRIGHT.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
10
11//! Implementation of [the WTF-8](https://simonsapin.github.io/wtf-8/) and
12//! [OMG-WTF-8](https://github.com/kennytm/omgwtf8) encodings.
13//!
14//! This library uses Rust’s type system to maintain
15//! [well-formedness](https://simonsapin.github.io/wtf-8/#well-formed),
16//! like the `String` and `&str` types do for UTF-8.
17//!
18//! Since [WTF-8 must not be used
19//! for interchange](https://simonsapin.github.io/wtf-8/#intended-audience),
20//! this library deliberately does not provide access to the underlying bytes
21//! of WTF-8 strings,
22//! nor can it decode WTF-8 from arbitrary bytes.
23//! WTF-8 strings can be obtained from UTF-8, UTF-16, or code points.
24
25// this module is imported from @SimonSapin's repo and has tons of dead code on
26// unix (it's mostly used on windows), so don't worry about dead code here.
27#![allow(dead_code)]
28
29use std::fmt;
30use std::hash::{Hash, Hasher};
31use std::mem;
32use std::ops;
33use std::marker::PhantomData;
34use std::str;
35use std::num::NonZeroU16;
36use std::cmp;
37use std::slice;
38
/// UTF-8 text of U+FFFD REPLACEMENT CHARACTER, written by the `Display`
/// implementation in place of every surrogate it encounters.
const UTF8_REPLACEMENT_CHARACTER: &str = "\u{FFFD}";
40
41/// Represents a high surrogate code point.
42///
43/// Internally, the value is the last 2 bytes of the surrogate in its canonical
44/// (WTF-8) representation, e.g. U+D800 is `ed a0 80` in WTF-8, so the value
45/// stored here would be `0xa080`. This also means the valid range of this type
46/// must be `0xa080..=0xafbf`.
47#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
48pub(super) struct HighSurrogate(NonZeroU16);
49impl HighSurrogate {
50    #[cfg(test)]
51    pub(super) fn from_code_point_unchecked(cp: u16) -> Self {
52        let encoded = cp & 0x3f | (cp << 2) & 0xf00 | 0xa080;
53        unsafe { HighSurrogate(NonZeroU16::new_unchecked(encoded)) }
54    }
55
56    fn decode(self) -> [u8; 3] {
57        let c = self.0.get();
58        [0xed, (c >> 8) as u8, c as u8]
59    }
60
61    pub(super) fn value(self) -> u16 {
62        self.0.get()
63    }
64}
65
66/// Represents a low surrogate code point.
67///
68/// Internally, the value is the last 2 bytes of the surrogate in its canonical
69/// (WTF-8) representation, e.g. U+DC00 is `ed b0 80` in WTF-8, so the value
70/// stored here would be `0xb080`. This also means the valid range of this type
71/// must be `0xb080..=0xbfbf`.
72#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
73pub(super) struct LowSurrogate(NonZeroU16);
74impl LowSurrogate {
75    #[cfg(test)]
76    pub(super) fn from_code_point_unchecked(cp: u16) -> Self {
77        let encoded = cp & 0x3f | (cp << 2) & 0xf00 | 0xb080;
78        unsafe { LowSurrogate(NonZeroU16::new_unchecked(encoded)) }
79    }
80
81    fn decode(self) -> [u8; 3] {
82        let c = self.0.get();
83        [0xed, (c >> 8) as u8, c as u8]
84    }
85
86    pub(super) fn value(self) -> u16 {
87        self.0.get()
88    }
89}
90
91fn decode_surrogate_pair(high: HighSurrogate, low: LowSurrogate) -> [u8; 4] {
92    // we want to transform the bits from:
93    //
94    //      high surrogate'   low surrogate
95    //      101wvuts 10rqpnmk 1011jihg 10fedcba
96    // to
97    //      UTF-8
98    //      11110wvu 10tsrqpn 10mkjihg 10fedcba
99    // ...
100
101    //       lo & 0xfff = 00000000 00000000 0000jihg 10fedbca
102    //
103    //         hi << 12 = 0000101w vuts10rq pnmk0000 00000000
104    //   ... & 0x303000 = 00000000 00ts0000 00mk0000 00000000
105    //
106    //         hi << 14 = 00101wvu ts10rqpn mk000000 00000000
107    //  ... & 0x70f0000 = 00000wvu 0000rqpn 00000000 00000000
108    //
109    //       0xf0808000 = 11110000 10000000 10000000 00000000
110    //
111    //        ... | ... = 11110wvu 10tsrqpn 10mkjihg 10fedcba
112    let lo = low.0.get() as u32;
113    let hi = (high.0.get() as u32) + 0x100;
114    let combined = (lo & 0xfff) | (hi << 12 & 0x303000) | (hi << 14 & 0x70f0000) | 0xf0808000;
115    combined.to_be_bytes()
116}
117
#[test]
fn test_decode_surrogate_pair() {
    // (stored high surrogate, stored low surrogate, expected UTF-8 bytes)
    let cases: [(u16, u16, [u8; 4]); 3] = [
        (0xa080, 0xb080, [0xf0, 0x90, 0x80, 0x80]),
        (0xa0bd, 0xb88d, [0xf0, 0x9f, 0x98, 0x8d]),
        (0xafbf, 0xbfbf, [0xf4, 0x8f, 0xbf, 0xbf]),
    ];
    for &(hi, lo, utf8) in cases.iter() {
        let high = HighSurrogate(NonZeroU16::new(hi).unwrap());
        let low = LowSurrogate(NonZeroU16::new(lo).unwrap());
        assert_eq!(decode_surrogate_pair(high, low), utf8);
    }
}
129
130
131/// Represents a 3-byte sequence as part of a well-formed OMG-WTF-8 sequence.
132///
133/// Internally, the sequence is encoded as a big-endian integer to simplify
134/// computation (not using native endian here since there's no advantage in
135/// reading *3* bytes).
136#[derive(Copy, Clone)]
137pub(super) struct ThreeByteSeq(u32);
138impl ThreeByteSeq {
139    fn to_high_surrogate_from_split_repr_unchecked(self) -> u16 {
140        // the high surrogate in split representation has bit pattern
141        //
142        //  self.0 =        ******** 11110kji 10hgfedc 10ba****
143        //
144        // thus:
145        //  self.0 >> 4 =   0000**** ****1111 0kji10hg fedc10ba
146        //        0x303 =   00000000 00000000 00000011 00000011
147        //            & =   00000000 00000000 000000hg 000000ba
148        //
149        //  self.0 >> 6 =   000000** ******11 110kji10 hgfedc10
150        //       0x3c3c =   00000000 00000000 00111100 00111100
151        //            & =   00000000 00000000 000kji00 00fedc00
152        //
153        //    ... | ... =   00000000 00000000 000kjihg 00fedcba
154        //
155        // The -0x100 is to account for the UTF-16 offset. The final
156        // 0xa080 is to make the final bit patterns compare the same as
157        // the canonical representation.
158        //
159        (((self.0 >> 4 & 0x303 | self.0 >> 6 & 0x3c3c) - 0x100) | 0xa080) as u16
160    }
161
162    /// Obtains the high surrogate value from this 3-byte sequence.
163    ///
164    /// If the input is not a high surrogate, returns None.
165    fn to_high_surrogate(self) -> Option<HighSurrogate> {
166        let surrogate_value = match self.0 {
167            // canonical representation
168            0xeda000..=0xedafff => self.0 as u16,
169            // split representation
170            0xf00000..=0xffffffff => self.to_high_surrogate_from_split_repr_unchecked(),
171            _ => 0,
172        };
173        NonZeroU16::new(surrogate_value).map(HighSurrogate)
174    }
175
176    /// Obtains the low surrogate value from this 3-byte sequence.
177    ///
178    /// If the input is not a low surrogate, returns None.
179    fn to_low_surrogate(self) -> Option<LowSurrogate> {
180        let surrogate_value = match self.0 {
181            // canonical representation
182            0xedb000..=0xedffff => self.0,
183            // split representation
184            0x800000..=0xbfffff => self.0 | 0xb000,
185            _ => 0,
186        };
187        NonZeroU16::new(surrogate_value as u16).map(LowSurrogate)
188    }
189
190    /// Extracts a WTF-16 code unit from the 3-byte sequence.
191    fn as_code_unit(self) -> u16 {
192        (match self.0 {
193            0xf00000...0xffffffff => {
194                (self.0 >> 4 & 3 | self.0 >> 6 & 0xfc | self.0 >> 8 & 0x700) + 0xd7c0
195            }
196            0x800000...0xbfffff => self.0 & 0x3f | self.0 >> 2 & 0x3c0 | 0xdc00,
197            _ => self.0 & 0x3f | self.0 >> 2 & 0xfc0 | self.0 >> 4 & 0xf000,
198        }) as u16
199    }
200
201    /// Constructs a 3-byte sequence from the bytes.
202    pub(super) fn new(input: &[u8]) -> Self {
203        assert!(input.len() >= 3);
204        ThreeByteSeq((input[0] as u32) << 16 | (input[1] as u32) << 8 | (input[2] as u32))
205    }
206
207    pub(super) fn value(self) -> u32 {
208        self.0
209    }
210}
211
/// A borrowed slice of well-formed WTF-8 data.
///
/// Similar to `&str`, but can additionally contain surrogate code points
/// if they’re not in a surrogate pair.
///
/// `#[repr(transparent)]` guarantees that `Wtf8` has the same layout as its
/// only field, which the `&[u8]` to `&Wtf8` reinterpretation in this module
/// relies on.
#[repr(transparent)]
pub struct Wtf8 {
    bytes: [u8]
}
219
220impl Wtf8 {
221    pub(super) fn as_inner(&self) -> &[u8] { &self.bytes }
222}
223
224/// Format the slice with double quotes,
225/// and surrogates as `\u` followed by four hexadecimal digits.
226/// Example: `"a\u{D800}"` for a slice with code points [U+0061, U+D800]
227impl fmt::Debug for Wtf8 {
228    fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
229        fn write_str_escaped(f: &mut fmt::Formatter, s: &str) -> fmt::Result {
230            use std::fmt::Write;
231            for c in s.chars().flat_map(|c| c.escape_debug()) {
232                f.write_char(c)?
233            }
234            Ok(())
235        }
236
237        formatter.write_str("\"")?;
238        let mut pos = 0;
239        while let Some((surrogate_pos, surrogate)) = self.next_surrogate(pos) {
240            write_str_escaped(
241                formatter,
242                unsafe { str::from_utf8_unchecked(
243                    &self.bytes[pos .. surrogate_pos]
244                )},
245            )?;
246            write!(formatter, "\\u{{{:x}}}", surrogate)?;
247            pos = surrogate_pos + 3;
248        }
249        write_str_escaped(
250            formatter,
251            unsafe { str::from_utf8_unchecked(&self.bytes[pos..]) },
252        )?;
253        formatter.write_str("\"")
254    }
255}
256
257impl fmt::Display for Wtf8 {
258    fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
259        let wtf8_bytes = &self.bytes;
260        let mut pos = 0;
261        loop {
262            match self.next_surrogate(pos) {
263                Some((surrogate_pos, _)) => {
264                    formatter.write_str(unsafe {
265                        str::from_utf8_unchecked(&wtf8_bytes[pos .. surrogate_pos])
266                    })?;
267                    formatter.write_str(UTF8_REPLACEMENT_CHARACTER)?;
268                    pos = surrogate_pos + 3;
269                },
270                None => {
271                    let s = unsafe {
272                        str::from_utf8_unchecked(&wtf8_bytes[pos..])
273                    };
274                    if pos == 0 {
275                        return s.fmt(formatter)
276                    } else {
277                        return formatter.write_str(s)
278                    }
279                }
280            }
281        }
282    }
283}
284
285impl Wtf8 {
286    /// Creates a WTF-8 slice from a UTF-8 `&str` slice.
287    ///
288    /// Since WTF-8 is a superset of UTF-8, this always succeeds.
289    #[inline]
290    pub fn from_str(value: &str) -> &Wtf8 {
291        unsafe { Wtf8::from_bytes_unchecked(value.as_bytes()) }
292    }
293
294    /// Creates a WTF-8 slice from a WTF-8 byte slice.
295    ///
296    /// Since the byte slice is not checked for valid WTF-8, this functions is
297    /// marked unsafe.
298    #[inline]
299    pub unsafe fn from_bytes_unchecked(value: &[u8]) -> &Wtf8 {
300        mem::transmute(value)
301    }
302
303    /// Returns the length, in WTF-8 bytes.
304    #[inline]
305    pub fn len(&self) -> usize {
306        self.bytes.len()
307    }
308
309    #[inline]
310    pub fn is_empty(&self) -> bool {
311        self.bytes.is_empty()
312    }
313
314    /// Returns the code point at `position` if it is in the ASCII range,
315    /// or `b'\xFF' otherwise.
316    ///
317    /// # Panics
318    ///
319    /// Panics if `position` is beyond the end of the string.
320    #[inline]
321    pub fn ascii_byte_at(&self, position: usize) -> u8 {
322        match self.bytes[position] {
323            ascii_byte @ 0x00 ... 0x7F => ascii_byte,
324            _ => 0xFF
325        }
326    }
327
328    /// Tries to convert the string to UTF-8 and return a `&str` slice.
329    ///
330    /// Returns `None` if the string contains surrogates.
331    ///
332    /// This does not copy the data.
333    #[inline]
334    pub fn as_str(&self) -> Option<&str> {
335        // Well-formed WTF-8 is also well-formed UTF-8
336        // if and only if it contains no surrogate.
337        match self.next_surrogate(0) {
338            None => Some(unsafe { str::from_utf8_unchecked(&self.bytes) }),
339            Some(_) => None,
340        }
341    }
342
343    /// Converts the WTF-8 string to potentially ill-formed UTF-16
344    /// and return an iterator of 16-bit code units.
345    ///
346    /// This is lossless:
347    /// calling `Wtf8Buf::from_ill_formed_utf16` on the resulting code units
348    /// would always return the original WTF-8 string.
349    #[inline]
350    pub fn encode_wide(&self) -> EncodeWide {
351        let ptr = self.bytes.as_ptr();
352        let end = unsafe { ptr.add(self.bytes.len()) };
353        EncodeWide { ptr, end, _marker: PhantomData }
354    }
355
356    #[inline]
357    fn next_surrogate(&self, mut pos: usize) -> Option<(usize, u16)> {
358        loop {
359            let inc = match *self.bytes.get(pos)? {
360                0..=0x7f => 1,
361                0x80..=0xbf => break,
362                0xc0..=0xdf => 2,
363                b @ 0xe0..=0xef => if b == 0xed && self.bytes[pos + 1] >= 0xa0 { break } else { 3 },
364                0xf0..=0xff => if self.len() == pos + 3 { break } else { 4 },
365                _ => unreachable!(),
366            };
367            pos += inc;
368        }
369        Some((pos, ThreeByteSeq::new(&self.bytes[pos..]).as_code_unit()))
370    }
371
372    /// Splits-off the first low surrogate from the string.
373    fn split_off_first_low_surrogate(self: &mut &Self) -> Option<LowSurrogate> {
374        let input = self.bytes.get(..3)?;
375        let res = ThreeByteSeq::new(input).to_low_surrogate()?;
376        *self = unsafe { Self::from_bytes_unchecked(&self.bytes[3..]) };
377        Some(res)
378    }
379
380    /// Splits-off the last high surrogate from the string.
381    fn split_off_last_high_surrogate(self: &mut &Self) -> Option<HighSurrogate> {
382        let e = self.len().checked_sub(3)?;
383        let res = ThreeByteSeq::new(&self.bytes[e..]).to_high_surrogate()?;
384        *self = unsafe { Self::from_bytes_unchecked(&self.bytes[..e]) };
385        Some(res)
386    }
387
388    /// Split the string into three parts: the beginning low surrogate, the
389    /// well-formed WTF-8 string in the middle, and the ending high surrogate.
390    pub(super) fn canonicalize(&self) -> (Option<LowSurrogate>, &[u8], Option<HighSurrogate>) {
391        let mut s = self;
392        let low = s.split_off_first_low_surrogate();
393        let high = s.split_off_last_high_surrogate();
394        (low, &s.bytes, high)
395    }
396
397    fn canonicalize_in_place(bytes: &mut [u8]) {
398        let len = bytes.len();
399        if len < 3 {
400            return;
401        }
402        // first 3 bytes form a low surrogate
403        // (this check is a faster version of `(0x80..0xc0).contains(_)`).
404        if (bytes[0] as i8) < -0x40 {
405            bytes[0] = 0xed;
406            bytes[1] |= 0x30;
407        }
408        // last 3 bytes form a high surrogate
409        if bytes[len - 3] >= 0xf0 {
410            let cu = ThreeByteSeq::new(&bytes[(len - 3)..]).to_high_surrogate_from_split_repr_unchecked();
411            bytes[len - 3] = 0xed;
412            bytes[len - 2] = (cu >> 8) as u8;
413            bytes[len - 1] = cu as u8;
414        }
415    }
416}
417
418// FIXME: Comparing Option<Surrogate> is not fully optimized yet #49892.
419
420impl PartialEq for Wtf8 {
421    fn eq(&self, other: &Self) -> bool {
422        self.canonicalize() == other.canonicalize()
423    }
424    fn ne(&self, other: &Self) -> bool {
425        self.canonicalize() != other.canonicalize()
426    }
427}
428impl Eq for Wtf8 {}
429
430impl PartialOrd for Wtf8 {
431    fn partial_cmp(&self, other: &Self) -> Option<cmp::Ordering> {
432        self.canonicalize().partial_cmp(&other.canonicalize())
433    }
434    fn lt(&self, other: &Self) -> bool {
435        self.canonicalize() < other.canonicalize()
436    }
437    fn le(&self, other: &Self) -> bool {
438        self.canonicalize() <= other.canonicalize()
439    }
440    fn gt(&self, other: &Self) -> bool {
441        self.canonicalize() > other.canonicalize()
442    }
443    fn ge(&self, other: &Self) -> bool {
444        self.canonicalize() >= other.canonicalize()
445    }
446}
447impl Ord for Wtf8 {
448    fn cmp(&self, other: &Self) -> cmp::Ordering {
449        self.canonicalize().cmp(&other.canonicalize())
450    }
451}
452
453/// Return a slice of the given string for the byte range [`begin`..`end`).
454///
455/// # Panics
456///
457/// Panics when `begin` and `end` do not point to code point boundaries,
458/// or point beyond the end of the string.
459impl ops::Index<ops::Range<usize>> for Wtf8 {
460    type Output = Wtf8;
461
462    #[inline]
463    fn index(&self, mut range: ops::Range<usize>) -> &Wtf8 {
464        if range.start == range.end {
465            return Self::from_str("");
466        }
467        match classify_index(self, range.start) {
468            IndexType::FourByteSeq2 => range.start -= 1,
469            IndexType::CharBoundary => {}
470            _ => slice_error_fail(self, range.start, range.end),
471        };
472        match classify_index(self, range.end) {
473            IndexType::FourByteSeq2 => range.end += 1,
474            IndexType::CharBoundary => {}
475            _ => slice_error_fail(self, range.start, range.end),
476        };
477        unsafe { slice_unchecked(self, range.start, range.end) }
478    }
479}
480
481/// Return a slice of the given string from byte `begin` to its end.
482///
483/// # Panics
484///
485/// Panics when `begin` is not at a code point boundary,
486/// or is beyond the end of the string.
487impl ops::Index<ops::RangeFrom<usize>> for Wtf8 {
488    type Output = Wtf8;
489
490    #[inline]
491    fn index(&self, mut range: ops::RangeFrom<usize>) -> &Wtf8 {
492        match classify_index(self, range.start) {
493            IndexType::FourByteSeq2 => range.start -= 1,
494            IndexType::CharBoundary => {}
495            _ => slice_error_fail(self, range.start, self.len()),
496        };
497        unsafe { slice_unchecked(self, range.start, self.len()) }
498    }
499}
500
501/// Return a slice of the given string from its beginning to byte `end`.
502///
503/// # Panics
504///
505/// Panics when `end` is not at a code point boundary,
506/// or is beyond the end of the string.
507impl ops::Index<ops::RangeTo<usize>> for Wtf8 {
508    type Output = Wtf8;
509
510    #[inline]
511    fn index(&self, mut range: ops::RangeTo<usize>) -> &Wtf8 {
512        match classify_index(self, range.end) {
513            IndexType::FourByteSeq2 => range.end += 1,
514            IndexType::CharBoundary => {}
515            _ => slice_error_fail(self, 0, range.end),
516        };
517            unsafe { slice_unchecked(self, 0, range.end) }
518    }
519}
520
521impl ops::Index<ops::RangeFull> for Wtf8 {
522    type Output = Wtf8;
523
524    #[inline]
525    fn index(&self, _range: ops::RangeFull) -> &Wtf8 {
526        self
527    }
528}
529
/// Type of an index in an OMG-WTF-8 string.
///
/// The discriminants are fixed explicitly; for `FourByteSeq1` to
/// `FourByteSeq3` the discriminant equals the byte offset of the index from
/// the start of the 4-byte sequence.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
#[repr(u8)]
enum IndexType {
    /// Boundary of a WTF-8 character sequence.
    CharBoundary = 0,
    /// Byte 1 in a 4-byte sequence.
    FourByteSeq1 = 1,
    /// Byte 2 in a 4-byte sequence.
    FourByteSeq2 = 2,
    /// Byte 3 in a 4-byte sequence.
    FourByteSeq3 = 3,
    /// Pointing inside a 2- or 3-byte sequence.
    Interior = 4,
    /// Out of bounds.
    OutOfBounds = 5,
}
547
548/// Classifies the kind of index in this string.
549fn classify_index(slice: &Wtf8, index: usize) -> IndexType {
550    let slice = &slice.bytes;
551    let len = slice.len();
552    if index == 0 || index == len {
553        return IndexType::CharBoundary;
554    }
555    match slice.get(index) {
556        Some(0x80..=0xbf) => {
557            let max_offset = index.min(3);
558            let min_offset = (index + 3).saturating_sub(len);
559            for offset in min_offset..max_offset {
560                let offset = offset + 1;
561                unsafe {
562                    if slice.get_unchecked(index - offset) >= &0xf0 {
563                        return mem::transmute(offset as u8);
564                    }
565                }
566            }
567            IndexType::Interior
568        }
569        Some(_) => IndexType::CharBoundary,
570        None => IndexType::OutOfBounds,
571    }
572}
573
574/// Copied from core::str::raw::slice_unchecked
575#[inline]
576pub unsafe fn slice_unchecked(s: &Wtf8, begin: usize, end: usize) -> &Wtf8 {
577    // memory layout of an &[u8] and &Wtf8 are the same
578    assert!(begin <= end);
579    Wtf8::from_bytes_unchecked(s.bytes.get_unchecked(begin..end))
580}
581
582/// Copied from core::str::raw::slice_error_fail
583#[inline(never)]
584pub fn slice_error_fail(s: &Wtf8, begin: usize, end: usize) -> ! {
585    assert!(begin <= end);
586    panic!("index {} and/or {} in `{:?}` do not lie on character boundary",
587          begin, end, s);
588}
589
/// Generates a wide character sequence for potentially ill-formed UTF-16.
///
/// Holds a raw pointer range over the bytes still to be encoded; the
/// iterator is exhausted when `ptr == end`.
#[derive(Clone)]
pub struct EncodeWide<'a> {
    /// Next byte to read when iterating from the front.
    ptr: *const u8,
    /// One past the last byte to read when iterating from the back.
    end: *const u8,
    /// Ties the raw pointers to the lifetime `'a`.
    _marker: PhantomData<&'a u8>,
}
597
/// Decodes a 2-byte sequence (`c` is the lead byte, `d` the continuation
/// byte) into its UTF-16 code unit: 5 payload bits from `c`, 6 from `d`.
#[inline]
fn code_unit_from_two_byte_seq(c: u8, d: u8) -> u16 {
    let upper = u16::from(c & 0x1f);
    let lower = u16::from(d & 0x3f);
    (upper << 6) | lower
}
602
603// Copied from libunicode/u_str.rs
604impl<'a> Iterator for EncodeWide<'a> {
605    type Item = u16;
606
607    #[inline]
608    fn next(&mut self) -> Option<u16> {
609        if self.ptr == self.end {
610            return None;
611        }
612
613        unsafe {
614            let c = *self.ptr;
615            match c {
616                0x00..=0x7f => {
617                    self.ptr = self.ptr.offset(1);
618                    Some(c as u16)
619                }
620                0x80..=0xbf | 0xe0..=0xff => {
621                    let tbs = ThreeByteSeq::new(slice::from_raw_parts(self.ptr, 3));
622                    let mut new_ptr = self.ptr.offset(3);
623                    if c >= 0xf0 && new_ptr != self.end {
624                        new_ptr = self.ptr.offset(1);
625                    }
626                    self.ptr = new_ptr;
627                    Some(tbs.as_code_unit())
628                }
629                0xc0..=0xdf => {
630                    let d = *self.ptr.offset(1);
631                    self.ptr = self.ptr.offset(2);
632                    Some(code_unit_from_two_byte_seq(c, d))
633                }
634                _ => unreachable!(),
635            }
636        }
637    }
638
639    #[inline]
640    fn size_hint(&self) -> (usize, Option<usize>) {
641        // converting from WTF-8 to WTF-16:
642        //  1-byte seq => 1 code unit (1x)
643        //  2-byte seq => 1 code unit (0.5x)
644        //  3-byte seq => 1 code unit (0.33x)
645        //  4-byte seq => 2 code units (0.5x)
646        //
647        // thus the lower-limit is everything being a 3-byte seq (= ceil(len/3))
648        // and upper-limit is everything being 1-byte seq (= len).
649        let len = unsafe { self.end.offset_from(self.ptr) as usize };
650        (len.saturating_add(2) / 3, Some(len))
651    }
652}
653
654impl<'a> DoubleEndedIterator for EncodeWide<'a> {
655    #[inline]
656    fn next_back(&mut self) -> Option<u16> {
657        if self.ptr == self.end {
658            return None;
659        }
660        unsafe {
661            let last = self.end.offset(-1);
662            let d = *last;
663            if d < 0x80 {
664                self.end = last;
665                return Some(d as u16);
666            }
667
668            let last_2 = self.end.offset(-2);
669            let c = *last_2;
670            if 0xc0 <= c && c < 0xe0 {
671                self.end = last_2;
672                return Some(code_unit_from_two_byte_seq(c, d));
673            }
674
675            let mut new_end = self.end.offset(-3);
676            let tbs = ThreeByteSeq::new(slice::from_raw_parts(new_end, 3));
677            if *new_end < 0xc0 && self.ptr != new_end {
678                new_end = last;
679            }
680            self.end = new_end;
681            Some(tbs.as_code_unit())
682        }
683    }
684}
685
686impl Hash for Wtf8 {
687    #[inline]
688    fn hash<H: Hasher>(&self, state: &mut H) {
689        let (left, middle, right) = self.canonicalize();
690        if let Some(low) = left {
691            state.write(&low.decode());
692        }
693        state.write(middle);
694        if let Some(high) = right {
695            state.write(&high.decode());
696        }
697        0xfeu8.hash(state)
698    }
699}
700
701impl Wtf8 {
702    pub fn make_ascii_uppercase(&mut self) { self.bytes.make_ascii_uppercase() }
703}
704
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn wtf8_from_str() {
        assert_eq!(&Wtf8::from_str("").bytes, b"");
        assert_eq!(&Wtf8::from_str("aé 💩").bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
    }

    #[test]
    fn wtf8_len() {
        assert_eq!(Wtf8::from_str("").len(), 0);
        assert_eq!(Wtf8::from_str("aé 💩").len(), 8);
    }

    #[test]
    fn wtf8_slice() {
        assert_eq!(&Wtf8::from_str("aé 💩")[1.. 4].bytes, b"\xC3\xA9 ");
    }

    // Slicing through the middle of 4-byte sequences yields 3-byte halves.
    #[test]
    fn omgwtf8_slice() {
        let s = Wtf8::from_str("😀😂😄");
        assert_eq!(&s[..].bytes, b"\xf0\x9f\x98\x80\xf0\x9f\x98\x82\xf0\x9f\x98\x84");
        assert_eq!(&s[2..].bytes, b"\x9f\x98\x80\xf0\x9f\x98\x82\xf0\x9f\x98\x84");
        assert_eq!(&s[4..].bytes, b"\xf0\x9f\x98\x82\xf0\x9f\x98\x84");
        assert_eq!(&s[..10].bytes, b"\xf0\x9f\x98\x80\xf0\x9f\x98\x82\xf0\x9f\x98");
        assert_eq!(&s[..8].bytes, b"\xf0\x9f\x98\x80\xf0\x9f\x98\x82");
        assert_eq!(&s[2..10].bytes, b"\x9f\x98\x80\xf0\x9f\x98\x82\xf0\x9f\x98");
        assert_eq!(&s[4..8].bytes, b"\xf0\x9f\x98\x82");
        assert_eq!(&s[2..4].bytes, b"\x9f\x98\x80");
        assert_eq!(&s[2..2].bytes, b"");
        assert_eq!(&s[0..2].bytes, b"\xf0\x9f\x98");
        assert_eq!(&s[4..4].bytes, b"");
    }

    #[test]
    #[should_panic]
    fn wtf8_slice_not_code_point_boundary() {
        let _ = &Wtf8::from_str("aé 💩")[2.. 4];
    }

    #[test]
    fn wtf8_slice_from() {
        assert_eq!(&Wtf8::from_str("aé 💩")[1..].bytes, b"\xC3\xA9 \xF0\x9F\x92\xA9");
    }

    #[test]
    #[should_panic]
    fn wtf8_slice_from_not_code_point_boundary() {
        let _ = &Wtf8::from_str("aé 💩")[2..];
    }

    #[test]
    fn wtf8_slice_to() {
        assert_eq!(&Wtf8::from_str("aé 💩")[..4].bytes, b"a\xC3\xA9 ");
    }

    #[test]
    #[should_panic]
    fn wtf8_slice_to_not_code_point_boundary() {
        // `..5` (not `5..`): this test is about the `RangeTo` impl.
        let _ = &Wtf8::from_str("aé 💩")[..5];
    }

    #[test]
    #[should_panic]
    fn test_slice_into_invalid_index_split_begin_1() {
        let s = unsafe { Wtf8::from_bytes_unchecked(b"\x90\x80\x80\x7e") };
        let _ = &s[..1];
    }
    #[test]
    #[should_panic]
    fn test_slice_into_invalid_index_split_begin_2() {
        let s = unsafe { Wtf8::from_bytes_unchecked(b"\x90\x80\x80\x7e") };
        let _ = &s[..2];
    }
    #[test]
    #[should_panic]
    fn test_slice_into_invalid_index_split_end_1() {
        let s = unsafe { Wtf8::from_bytes_unchecked(b"\x7e\xf0\x90\x80") };
        let _ = &s[2..];
    }
    #[test]
    #[should_panic]
    fn test_slice_into_invalid_index_split_end_2() {
        let s = unsafe { Wtf8::from_bytes_unchecked(b"\x7e\xf0\x90\x80") };
        let _ = &s[3..];
    }
    #[test]
    #[should_panic]
    fn test_slice_into_invalid_index_canonical_1() {
        let s = unsafe { Wtf8::from_bytes_unchecked(b"\xed\xaf\xbf") };
        let _ = &s[1..];
    }
    #[test]
    #[should_panic]
    fn test_slice_into_invalid_index_canonical_2() {
        let s = unsafe { Wtf8::from_bytes_unchecked(b"\xed\xaf\xbf") };
        let _ = &s[2..];
    }
    #[test]
    #[should_panic]
    fn test_slice_into_invalid_index_wrong_order() {
        let s = Wtf8::from_str("12345");
        let _ = &s[3..1];
    }

    #[test]
    fn wtf8_ascii_byte_at() {
        let slice = Wtf8::from_str("aé 💩");
        assert_eq!(slice.ascii_byte_at(0), b'a');
        assert_eq!(slice.ascii_byte_at(1), b'\xFF');
        assert_eq!(slice.ascii_byte_at(2), b'\xFF');
        assert_eq!(slice.ascii_byte_at(3), b' ');
        assert_eq!(slice.ascii_byte_at(4), b'\xFF');
    }

    // Checks `encode_wide()` in both iteration directions.
    macro_rules! check_encode_wide {
        ($s:expr, $cu:expr) => {
            let mut v = $cu;
            assert_eq!($s.encode_wide().collect::<Vec<_>>(), v);
            v.reverse();
            assert_eq!($s.encode_wide().rev().collect::<Vec<_>>(), v);
        }
    }

    #[test]
    #[cfg(feature = "std")]
    fn wtf8_encode_wide() {
        let string = unsafe { Wtf8::from_bytes_unchecked(b"a\xc3\xa9 \xed\xa0\xbd\xf0\x9f\x92\xa9") };
        check_encode_wide!(string, vec![0x61, 0xE9, 0x20, 0xD83D, 0xD83D, 0xDCA9]);
    }

    #[test]
    #[cfg(feature = "std")]
    fn omgwtf8_encode_wide() {
        let s = Wtf8::from_str("😀😂😄");
        check_encode_wide!(s, vec![0xd83d, 0xde00, 0xd83d, 0xde02, 0xd83d, 0xde04]);
        check_encode_wide!(s[2..], vec![0xde00, 0xd83d, 0xde02, 0xd83d, 0xde04]);
        check_encode_wide!(s[..10], vec![0xd83d, 0xde00, 0xd83d, 0xde02, 0xd83d]);
    }

    // Split and canonical surrogate representations must compare and hash
    // identically.
    #[test]
    #[cfg(feature = "std")]
    fn omgwtf8_eq_hash() {
        use std::collections::hash_map::DefaultHasher;

        let a = unsafe { Wtf8::from_bytes_unchecked(b"\x90\x8b\xae~\xf0\x90\x80") };
        let b = unsafe { Wtf8::from_bytes_unchecked(b"\xed\xbb\xae~\xf0\x90\x80") };
        let c = unsafe { Wtf8::from_bytes_unchecked(b"\x90\x8b\xae~\xed\xa0\x80") };
        let d = unsafe { Wtf8::from_bytes_unchecked(b"\xed\xbb\xae~\xed\xa0\x80") };

        assert_eq!(a, b);
        assert_eq!(b, c);
        assert_eq!(c, d);

        fn hash<H: Hash>(a: H) -> u64 {
            let mut h = DefaultHasher::new();
            a.hash(&mut h);
            h.finish()
        }

        assert_eq!(hash(a), hash(b));
        assert_eq!(hash(b), hash(c));
        assert_eq!(hash(c), hash(d));
    }

    #[test]
    #[cfg(feature = "std")]
    fn omgwtf8_classify_index() {
        use super::IndexType::*;

        // Classifies indices `0..expected.len()` and compares with `expected`.
        fn consume(input: &Wtf8, expected: &[IndexType]) {
            let actual = (0..expected.len()).map(|i| classify_index(input, i)).collect::<Vec<_>>();
            assert_eq!(&*actual, expected);
        }
        consume(
            Wtf8::from_str(""),
            &[CharBoundary, OutOfBounds, OutOfBounds],
        );
        consume(
            Wtf8::from_str("aa"),
            &[CharBoundary, CharBoundary, CharBoundary, OutOfBounds],
        );
        consume(
            Wtf8::from_str("á"),
            &[CharBoundary, Interior, CharBoundary, OutOfBounds],
        );
        consume(
            Wtf8::from_str("\u{3000}"),
            &[CharBoundary, Interior, Interior, CharBoundary, OutOfBounds],
        );
        consume(
            Wtf8::from_str("\u{30000}"),
            &[CharBoundary, FourByteSeq1, FourByteSeq2, FourByteSeq3, CharBoundary, OutOfBounds],
        );
        consume(
            unsafe { Wtf8::from_bytes_unchecked(b"\xed\xbf\xbf\xed\xa0\x80") },
            &[
                CharBoundary, Interior, Interior,
                CharBoundary, Interior, Interior,
                CharBoundary, OutOfBounds,
            ],
        );
        consume(
            unsafe { Wtf8::from_bytes_unchecked(b"\x90\x80\x80\xf0\x90\x80\x80\xf0\x90\x80") },
            &[
                CharBoundary, Interior, Interior,
                CharBoundary, FourByteSeq1, FourByteSeq2, FourByteSeq3,
                CharBoundary, Interior, Interior,
                CharBoundary, OutOfBounds,
            ],
        );
    }
}
pattern_3/omgwtf8/wtf8.rs

pattern_3/omgwtf8/
wtf8.rs