1#![allow(dead_code)]
28
29use std::fmt;
30use std::hash::{Hash, Hasher};
31use std::mem;
32use std::ops;
33use std::marker::PhantomData;
34use std::str;
35use std::num::NonZeroU16;
36use std::cmp;
37use std::slice;
38
/// U+FFFD REPLACEMENT CHARACTER as a UTF-8 string; the `Display` impl of
/// `Wtf8` writes it in place of every surrogate it encounters.
const UTF8_REPLACEMENT_CHARACTER: &str = "\u{FFFD}";
40
41#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
48pub(super) struct HighSurrogate(NonZeroU16);
49impl HighSurrogate {
50 #[cfg(test)]
51 pub(super) fn from_code_point_unchecked(cp: u16) -> Self {
52 let encoded = cp & 0x3f | (cp << 2) & 0xf00 | 0xa080;
53 unsafe { HighSurrogate(NonZeroU16::new_unchecked(encoded)) }
54 }
55
56 fn decode(self) -> [u8; 3] {
57 let c = self.0.get();
58 [0xed, (c >> 8) as u8, c as u8]
59 }
60
61 pub(super) fn value(self) -> u16 {
62 self.0.get()
63 }
64}
65
66#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
73pub(super) struct LowSurrogate(NonZeroU16);
74impl LowSurrogate {
75 #[cfg(test)]
76 pub(super) fn from_code_point_unchecked(cp: u16) -> Self {
77 let encoded = cp & 0x3f | (cp << 2) & 0xf00 | 0xb080;
78 unsafe { LowSurrogate(NonZeroU16::new_unchecked(encoded)) }
79 }
80
81 fn decode(self) -> [u8; 3] {
82 let c = self.0.get();
83 [0xed, (c >> 8) as u8, c as u8]
84 }
85
86 pub(super) fn value(self) -> u16 {
87 self.0.get()
88 }
89}
90
91fn decode_surrogate_pair(high: HighSurrogate, low: LowSurrogate) -> [u8; 4] {
92 let lo = low.0.get() as u32;
113 let hi = (high.0.get() as u32) + 0x100;
114 let combined = (lo & 0xfff) | (hi << 12 & 0x303000) | (hi << 14 & 0x70f0000) | 0xf0808000;
115 combined.to_be_bytes()
116}
117
/// Checks `decode_surrogate_pair` on the first, a middle, and the last
/// surrogate pair.
#[test]
fn test_decode_surrogate_pair() {
    fn check(high_repr: u16, low_repr: u16, expected: [u8; 4]) {
        let high = HighSurrogate(NonZeroU16::new(high_repr).unwrap());
        let low = LowSurrogate(NonZeroU16::new(low_repr).unwrap());
        let actual = decode_surrogate_pair(high, low);
        assert_eq!(actual, expected);
    }

    check(0xa080, 0xb080, [0xf0, 0x90, 0x80, 0x80]);
    check(0xa0bd, 0xb88d, [0xf0, 0x9f, 0x98, 0x8d]);
    check(0xafbf, 0xbfbf, [0xf4, 0x8f, 0xbf, 0xbf]);
}
129
130
131#[derive(Copy, Clone)]
137pub(super) struct ThreeByteSeq(u32);
138impl ThreeByteSeq {
139 fn to_high_surrogate_from_split_repr_unchecked(self) -> u16 {
140 (((self.0 >> 4 & 0x303 | self.0 >> 6 & 0x3c3c) - 0x100) | 0xa080) as u16
160 }
161
162 fn to_high_surrogate(self) -> Option<HighSurrogate> {
166 let surrogate_value = match self.0 {
167 0xeda000..=0xedafff => self.0 as u16,
169 0xf00000..=0xffffffff => self.to_high_surrogate_from_split_repr_unchecked(),
171 _ => 0,
172 };
173 NonZeroU16::new(surrogate_value).map(HighSurrogate)
174 }
175
176 fn to_low_surrogate(self) -> Option<LowSurrogate> {
180 let surrogate_value = match self.0 {
181 0xedb000..=0xedffff => self.0,
183 0x800000..=0xbfffff => self.0 | 0xb000,
185 _ => 0,
186 };
187 NonZeroU16::new(surrogate_value as u16).map(LowSurrogate)
188 }
189
190 fn as_code_unit(self) -> u16 {
192 (match self.0 {
193 0xf00000...0xffffffff => {
194 (self.0 >> 4 & 3 | self.0 >> 6 & 0xfc | self.0 >> 8 & 0x700) + 0xd7c0
195 }
196 0x800000...0xbfffff => self.0 & 0x3f | self.0 >> 2 & 0x3c0 | 0xdc00,
197 _ => self.0 & 0x3f | self.0 >> 2 & 0xfc0 | self.0 >> 4 & 0xf000,
198 }) as u16
199 }
200
201 pub(super) fn new(input: &[u8]) -> Self {
203 assert!(input.len() >= 3);
204 ThreeByteSeq((input[0] as u32) << 16 | (input[1] as u32) << 8 | (input[2] as u32))
205 }
206
207 pub(super) fn value(self) -> u32 {
208 self.0
209 }
210}
211
/// A borrowed, dynamically sized slice of WTF-8 encoded bytes.
///
/// `#[repr(transparent)]` guarantees the type has exactly the layout of its
/// only field, `[u8]`, which is what reinterpreting a `&[u8]` as a `&Wtf8`
/// relies on.
#[repr(transparent)]
pub struct Wtf8 {
    /// The raw encoded bytes.
    bytes: [u8],
}
219
220impl Wtf8 {
221 pub(super) fn as_inner(&self) -> &[u8] { &self.bytes }
222}
223
224impl fmt::Debug for Wtf8 {
228 fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
229 fn write_str_escaped(f: &mut fmt::Formatter, s: &str) -> fmt::Result {
230 use std::fmt::Write;
231 for c in s.chars().flat_map(|c| c.escape_debug()) {
232 f.write_char(c)?
233 }
234 Ok(())
235 }
236
237 formatter.write_str("\"")?;
238 let mut pos = 0;
239 while let Some((surrogate_pos, surrogate)) = self.next_surrogate(pos) {
240 write_str_escaped(
241 formatter,
242 unsafe { str::from_utf8_unchecked(
243 &self.bytes[pos .. surrogate_pos]
244 )},
245 )?;
246 write!(formatter, "\\u{{{:x}}}", surrogate)?;
247 pos = surrogate_pos + 3;
248 }
249 write_str_escaped(
250 formatter,
251 unsafe { str::from_utf8_unchecked(&self.bytes[pos..]) },
252 )?;
253 formatter.write_str("\"")
254 }
255}
256
257impl fmt::Display for Wtf8 {
258 fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
259 let wtf8_bytes = &self.bytes;
260 let mut pos = 0;
261 loop {
262 match self.next_surrogate(pos) {
263 Some((surrogate_pos, _)) => {
264 formatter.write_str(unsafe {
265 str::from_utf8_unchecked(&wtf8_bytes[pos .. surrogate_pos])
266 })?;
267 formatter.write_str(UTF8_REPLACEMENT_CHARACTER)?;
268 pos = surrogate_pos + 3;
269 },
270 None => {
271 let s = unsafe {
272 str::from_utf8_unchecked(&wtf8_bytes[pos..])
273 };
274 if pos == 0 {
275 return s.fmt(formatter)
276 } else {
277 return formatter.write_str(s)
278 }
279 }
280 }
281 }
282 }
283}
284
285impl Wtf8 {
286 #[inline]
290 pub fn from_str(value: &str) -> &Wtf8 {
291 unsafe { Wtf8::from_bytes_unchecked(value.as_bytes()) }
292 }
293
294 #[inline]
299 pub unsafe fn from_bytes_unchecked(value: &[u8]) -> &Wtf8 {
300 mem::transmute(value)
301 }
302
303 #[inline]
305 pub fn len(&self) -> usize {
306 self.bytes.len()
307 }
308
309 #[inline]
310 pub fn is_empty(&self) -> bool {
311 self.bytes.is_empty()
312 }
313
314 #[inline]
321 pub fn ascii_byte_at(&self, position: usize) -> u8 {
322 match self.bytes[position] {
323 ascii_byte @ 0x00 ... 0x7F => ascii_byte,
324 _ => 0xFF
325 }
326 }
327
328 #[inline]
334 pub fn as_str(&self) -> Option<&str> {
335 match self.next_surrogate(0) {
338 None => Some(unsafe { str::from_utf8_unchecked(&self.bytes) }),
339 Some(_) => None,
340 }
341 }
342
343 #[inline]
350 pub fn encode_wide(&self) -> EncodeWide {
351 let ptr = self.bytes.as_ptr();
352 let end = unsafe { ptr.add(self.bytes.len()) };
353 EncodeWide { ptr, end, _marker: PhantomData }
354 }
355
356 #[inline]
357 fn next_surrogate(&self, mut pos: usize) -> Option<(usize, u16)> {
358 loop {
359 let inc = match *self.bytes.get(pos)? {
360 0..=0x7f => 1,
361 0x80..=0xbf => break,
362 0xc0..=0xdf => 2,
363 b @ 0xe0..=0xef => if b == 0xed && self.bytes[pos + 1] >= 0xa0 { break } else { 3 },
364 0xf0..=0xff => if self.len() == pos + 3 { break } else { 4 },
365 _ => unreachable!(),
366 };
367 pos += inc;
368 }
369 Some((pos, ThreeByteSeq::new(&self.bytes[pos..]).as_code_unit()))
370 }
371
372 fn split_off_first_low_surrogate(self: &mut &Self) -> Option<LowSurrogate> {
374 let input = self.bytes.get(..3)?;
375 let res = ThreeByteSeq::new(input).to_low_surrogate()?;
376 *self = unsafe { Self::from_bytes_unchecked(&self.bytes[3..]) };
377 Some(res)
378 }
379
380 fn split_off_last_high_surrogate(self: &mut &Self) -> Option<HighSurrogate> {
382 let e = self.len().checked_sub(3)?;
383 let res = ThreeByteSeq::new(&self.bytes[e..]).to_high_surrogate()?;
384 *self = unsafe { Self::from_bytes_unchecked(&self.bytes[..e]) };
385 Some(res)
386 }
387
388 pub(super) fn canonicalize(&self) -> (Option<LowSurrogate>, &[u8], Option<HighSurrogate>) {
391 let mut s = self;
392 let low = s.split_off_first_low_surrogate();
393 let high = s.split_off_last_high_surrogate();
394 (low, &s.bytes, high)
395 }
396
397 fn canonicalize_in_place(bytes: &mut [u8]) {
398 let len = bytes.len();
399 if len < 3 {
400 return;
401 }
402 if (bytes[0] as i8) < -0x40 {
405 bytes[0] = 0xed;
406 bytes[1] |= 0x30;
407 }
408 if bytes[len - 3] >= 0xf0 {
410 let cu = ThreeByteSeq::new(&bytes[(len - 3)..]).to_high_surrogate_from_split_repr_unchecked();
411 bytes[len - 3] = 0xed;
412 bytes[len - 2] = (cu >> 8) as u8;
413 bytes[len - 1] = cu as u8;
414 }
415 }
416}
417
418impl PartialEq for Wtf8 {
421 fn eq(&self, other: &Self) -> bool {
422 self.canonicalize() == other.canonicalize()
423 }
424 fn ne(&self, other: &Self) -> bool {
425 self.canonicalize() != other.canonicalize()
426 }
427}
428impl Eq for Wtf8 {}
429
430impl PartialOrd for Wtf8 {
431 fn partial_cmp(&self, other: &Self) -> Option<cmp::Ordering> {
432 self.canonicalize().partial_cmp(&other.canonicalize())
433 }
434 fn lt(&self, other: &Self) -> bool {
435 self.canonicalize() < other.canonicalize()
436 }
437 fn le(&self, other: &Self) -> bool {
438 self.canonicalize() <= other.canonicalize()
439 }
440 fn gt(&self, other: &Self) -> bool {
441 self.canonicalize() > other.canonicalize()
442 }
443 fn ge(&self, other: &Self) -> bool {
444 self.canonicalize() >= other.canonicalize()
445 }
446}
447impl Ord for Wtf8 {
448 fn cmp(&self, other: &Self) -> cmp::Ordering {
449 self.canonicalize().cmp(&other.canonicalize())
450 }
451}
452
453impl ops::Index<ops::Range<usize>> for Wtf8 {
460 type Output = Wtf8;
461
462 #[inline]
463 fn index(&self, mut range: ops::Range<usize>) -> &Wtf8 {
464 if range.start == range.end {
465 return Self::from_str("");
466 }
467 match classify_index(self, range.start) {
468 IndexType::FourByteSeq2 => range.start -= 1,
469 IndexType::CharBoundary => {}
470 _ => slice_error_fail(self, range.start, range.end),
471 };
472 match classify_index(self, range.end) {
473 IndexType::FourByteSeq2 => range.end += 1,
474 IndexType::CharBoundary => {}
475 _ => slice_error_fail(self, range.start, range.end),
476 };
477 unsafe { slice_unchecked(self, range.start, range.end) }
478 }
479}
480
481impl ops::Index<ops::RangeFrom<usize>> for Wtf8 {
488 type Output = Wtf8;
489
490 #[inline]
491 fn index(&self, mut range: ops::RangeFrom<usize>) -> &Wtf8 {
492 match classify_index(self, range.start) {
493 IndexType::FourByteSeq2 => range.start -= 1,
494 IndexType::CharBoundary => {}
495 _ => slice_error_fail(self, range.start, self.len()),
496 };
497 unsafe { slice_unchecked(self, range.start, self.len()) }
498 }
499}
500
501impl ops::Index<ops::RangeTo<usize>> for Wtf8 {
508 type Output = Wtf8;
509
510 #[inline]
511 fn index(&self, mut range: ops::RangeTo<usize>) -> &Wtf8 {
512 match classify_index(self, range.end) {
513 IndexType::FourByteSeq2 => range.end += 1,
514 IndexType::CharBoundary => {}
515 _ => slice_error_fail(self, 0, range.end),
516 };
517 unsafe { slice_unchecked(self, 0, range.end) }
518 }
519}
520
521impl ops::Index<ops::RangeFull> for Wtf8 {
522 type Output = Wtf8;
523
524 #[inline]
525 fn index(&self, _range: ops::RangeFull) -> &Wtf8 {
526 self
527 }
528}
529
/// Classification of a byte index into a `Wtf8` slice, as produced by
/// `classify_index`.
///
/// The discriminants of the `FourByteSeq*` variants equal the distance from
/// the sequence's lead byte; `classify_index` relies on those exact values.
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
#[repr(u8)]
enum IndexType {
    /// Index 0, the slice length, or any byte that is not a continuation byte.
    CharBoundary = 0,
    /// Continuation byte one past the lead byte of a four-byte sequence.
    FourByteSeq1 = 1,
    /// Continuation byte two past the lead byte of a four-byte sequence.
    FourByteSeq2 = 2,
    /// Continuation byte three past the lead byte of a four-byte sequence.
    FourByteSeq3 = 3,
    /// Any other continuation byte.
    Interior = 4,
    /// The index is past the end of the slice.
    OutOfBounds = 5,
}
547
548fn classify_index(slice: &Wtf8, index: usize) -> IndexType {
550 let slice = &slice.bytes;
551 let len = slice.len();
552 if index == 0 || index == len {
553 return IndexType::CharBoundary;
554 }
555 match slice.get(index) {
556 Some(0x80..=0xbf) => {
557 let max_offset = index.min(3);
558 let min_offset = (index + 3).saturating_sub(len);
559 for offset in min_offset..max_offset {
560 let offset = offset + 1;
561 unsafe {
562 if slice.get_unchecked(index - offset) >= &0xf0 {
563 return mem::transmute(offset as u8);
564 }
565 }
566 }
567 IndexType::Interior
568 }
569 Some(_) => IndexType::CharBoundary,
570 None => IndexType::OutOfBounds,
571 }
572}
573
574#[inline]
576pub unsafe fn slice_unchecked(s: &Wtf8, begin: usize, end: usize) -> &Wtf8 {
577 assert!(begin <= end);
579 Wtf8::from_bytes_unchecked(s.bytes.get_unchecked(begin..end))
580}
581
582#[inline(never)]
584pub fn slice_error_fail(s: &Wtf8, begin: usize, end: usize) -> ! {
585 assert!(begin <= end);
586 panic!("index {} and/or {} in `{:?}` do not lie on character boundary",
587 begin, end, s);
588}
589
/// Iterator over the `u16` code units of a `Wtf8` slice, created by
/// `Wtf8::encode_wide`. It can be advanced from both ends.
#[derive(Clone)]
pub struct EncodeWide<'a> {
    /// Next byte to read when iterating from the front.
    ptr: *const u8,
    /// One past the last byte still to be read; moved backwards by
    /// `next_back`.
    end: *const u8,
    /// Ties the raw pointers to the lifetime of the borrowed bytes.
    _marker: PhantomData<&'a u8>,
}
597
/// Decodes a two-byte sequence (`c` = lead byte, `d` = continuation byte)
/// into its code unit: the low five bits of `c` followed by the low six
/// bits of `d`.
#[inline]
fn code_unit_from_two_byte_seq(c: u8, d: u8) -> u16 {
    let lead_payload = u16::from(c & 0x1f);
    let trail_payload = u16::from(d & 0x3f);
    (lead_payload << 6) | trail_payload
}
602
603impl<'a> Iterator for EncodeWide<'a> {
605 type Item = u16;
606
607 #[inline]
608 fn next(&mut self) -> Option<u16> {
609 if self.ptr == self.end {
610 return None;
611 }
612
613 unsafe {
614 let c = *self.ptr;
615 match c {
616 0x00..=0x7f => {
617 self.ptr = self.ptr.offset(1);
618 Some(c as u16)
619 }
620 0x80..=0xbf | 0xe0..=0xff => {
621 let tbs = ThreeByteSeq::new(slice::from_raw_parts(self.ptr, 3));
622 let mut new_ptr = self.ptr.offset(3);
623 if c >= 0xf0 && new_ptr != self.end {
624 new_ptr = self.ptr.offset(1);
625 }
626 self.ptr = new_ptr;
627 Some(tbs.as_code_unit())
628 }
629 0xc0..=0xdf => {
630 let d = *self.ptr.offset(1);
631 self.ptr = self.ptr.offset(2);
632 Some(code_unit_from_two_byte_seq(c, d))
633 }
634 _ => unreachable!(),
635 }
636 }
637 }
638
639 #[inline]
640 fn size_hint(&self) -> (usize, Option<usize>) {
641 let len = unsafe { self.end.offset_from(self.ptr) as usize };
650 (len.saturating_add(2) / 3, Some(len))
651 }
652}
653
654impl<'a> DoubleEndedIterator for EncodeWide<'a> {
655 #[inline]
656 fn next_back(&mut self) -> Option<u16> {
657 if self.ptr == self.end {
658 return None;
659 }
660 unsafe {
661 let last = self.end.offset(-1);
662 let d = *last;
663 if d < 0x80 {
664 self.end = last;
665 return Some(d as u16);
666 }
667
668 let last_2 = self.end.offset(-2);
669 let c = *last_2;
670 if 0xc0 <= c && c < 0xe0 {
671 self.end = last_2;
672 return Some(code_unit_from_two_byte_seq(c, d));
673 }
674
675 let mut new_end = self.end.offset(-3);
676 let tbs = ThreeByteSeq::new(slice::from_raw_parts(new_end, 3));
677 if *new_end < 0xc0 && self.ptr != new_end {
678 new_end = last;
679 }
680 self.end = new_end;
681 Some(tbs.as_code_unit())
682 }
683 }
684}
685
686impl Hash for Wtf8 {
687 #[inline]
688 fn hash<H: Hasher>(&self, state: &mut H) {
689 let (left, middle, right) = self.canonicalize();
690 if let Some(low) = left {
691 state.write(&low.decode());
692 }
693 state.write(middle);
694 if let Some(high) = right {
695 state.write(&high.decode());
696 }
697 0xfeu8.hash(state)
698 }
699}
700
701impl Wtf8 {
702 pub fn make_ascii_uppercase(&mut self) { self.bytes.make_ascii_uppercase() }
703}
704
/// Unit tests for `Wtf8` slicing, classification, UTF-16 encoding, equality
/// and hashing.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn wtf8_from_str() {
        assert_eq!(&Wtf8::from_str("").bytes, b"");
        assert_eq!(&Wtf8::from_str("aé 💩").bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
    }

    #[test]
    fn wtf8_len() {
        assert_eq!(Wtf8::from_str("").len(), 0);
        assert_eq!(Wtf8::from_str("aé 💩").len(), 8);
    }

    #[test]
    fn wtf8_slice() {
        assert_eq!(&Wtf8::from_str("aé 💩")[1..4].bytes, b"\xC3\xA9 ");
    }

    /// Slicing through the middle of four-byte sequences keeps three bytes
    /// of the split sequence on each side.
    #[test]
    fn omgwtf8_slice() {
        let s = Wtf8::from_str("😀😂😄");
        assert_eq!(&s[..].bytes, b"\xf0\x9f\x98\x80\xf0\x9f\x98\x82\xf0\x9f\x98\x84");
        assert_eq!(&s[2..].bytes, b"\x9f\x98\x80\xf0\x9f\x98\x82\xf0\x9f\x98\x84");
        assert_eq!(&s[4..].bytes, b"\xf0\x9f\x98\x82\xf0\x9f\x98\x84");
        assert_eq!(&s[..10].bytes, b"\xf0\x9f\x98\x80\xf0\x9f\x98\x82\xf0\x9f\x98");
        assert_eq!(&s[..8].bytes, b"\xf0\x9f\x98\x80\xf0\x9f\x98\x82");
        assert_eq!(&s[2..10].bytes, b"\x9f\x98\x80\xf0\x9f\x98\x82\xf0\x9f\x98");
        assert_eq!(&s[4..8].bytes, b"\xf0\x9f\x98\x82");
        assert_eq!(&s[2..4].bytes, b"\x9f\x98\x80");
        assert_eq!(&s[2..2].bytes, b"");
        assert_eq!(&s[0..2].bytes, b"\xf0\x9f\x98");
        assert_eq!(&s[4..4].bytes, b"");
    }

    #[test]
    #[should_panic]
    fn wtf8_slice_not_code_point_boundary() {
        let _ = &Wtf8::from_str("aé 💩")[2..4];
    }

    #[test]
    fn wtf8_slice_from() {
        assert_eq!(&Wtf8::from_str("aé 💩")[1..].bytes, b"\xC3\xA9 \xF0\x9F\x92\xA9");
    }

    #[test]
    #[should_panic]
    fn wtf8_slice_from_not_code_point_boundary() {
        let _ = &Wtf8::from_str("aé 💩")[2..];
    }

    #[test]
    fn wtf8_slice_to() {
        assert_eq!(&Wtf8::from_str("aé 💩")[..4].bytes, b"a\xC3\xA9 ");
    }

    #[test]
    #[should_panic]
    fn wtf8_slice_to_not_code_point_boundary() {
        // Uses `..5` so that the `RangeTo` implementation is the one under
        // test; index 5 is one byte past the lead of the four-byte sequence.
        let _ = &Wtf8::from_str("aé 💩")[..5];
    }

    #[test]
    #[should_panic]
    fn test_slice_into_invalid_index_split_begin_1() {
        let s = unsafe { Wtf8::from_bytes_unchecked(b"\x90\x80\x80\x7e") };
        let _ = &s[..1];
    }
    #[test]
    #[should_panic]
    fn test_slice_into_invalid_index_split_begin_2() {
        let s = unsafe { Wtf8::from_bytes_unchecked(b"\x90\x80\x80\x7e") };
        let _ = &s[..2];
    }
    #[test]
    #[should_panic]
    fn test_slice_into_invalid_index_split_end_1() {
        let s = unsafe { Wtf8::from_bytes_unchecked(b"\x7e\xf0\x90\x80") };
        let _ = &s[2..];
    }
    #[test]
    #[should_panic]
    fn test_slice_into_invalid_index_split_end_2() {
        let s = unsafe { Wtf8::from_bytes_unchecked(b"\x7e\xf0\x90\x80") };
        let _ = &s[3..];
    }
    #[test]
    #[should_panic]
    fn test_slice_into_invalid_index_canonical_1() {
        let s = unsafe { Wtf8::from_bytes_unchecked(b"\xed\xaf\xbf") };
        let _ = &s[1..];
    }
    #[test]
    #[should_panic]
    fn test_slice_into_invalid_index_canonical_2() {
        let s = unsafe { Wtf8::from_bytes_unchecked(b"\xed\xaf\xbf") };
        let _ = &s[2..];
    }
    #[test]
    #[should_panic]
    fn test_slice_into_invalid_index_wrong_order() {
        let s = Wtf8::from_str("12345");
        let _ = &s[3..1];
    }

    #[test]
    fn wtf8_ascii_byte_at() {
        let slice = Wtf8::from_str("aé 💩");
        assert_eq!(slice.ascii_byte_at(0), b'a');
        assert_eq!(slice.ascii_byte_at(1), b'\xFF');
        assert_eq!(slice.ascii_byte_at(2), b'\xFF');
        assert_eq!(slice.ascii_byte_at(3), b' ');
        assert_eq!(slice.ascii_byte_at(4), b'\xFF');
    }

    /// Asserts that `$s.encode_wide()` yields `$cu` front-to-back and the
    /// reverse of `$cu` back-to-front.
    macro_rules! check_encode_wide {
        ($s:expr, $cu:expr) => {
            let mut v = $cu;
            assert_eq!($s.encode_wide().collect::<Vec<_>>(), v);
            v.reverse();
            assert_eq!($s.encode_wide().rev().collect::<Vec<_>>(), v);
        }
    }

    #[test]
    #[cfg(feature = "std")]
    fn wtf8_encode_wide() {
        let string = unsafe { Wtf8::from_bytes_unchecked(b"a\xc3\xa9 \xed\xa0\xbd\xf0\x9f\x92\xa9") };
        check_encode_wide!(string, vec![0x61, 0xE9, 0x20, 0xD83D, 0xD83D, 0xDCA9]);
    }

    #[test]
    #[cfg(feature = "std")]
    fn omgwtf8_encode_wide() {
        let s = Wtf8::from_str("😀😂😄");
        check_encode_wide!(s, vec![0xd83d, 0xde00, 0xd83d, 0xde02, 0xd83d, 0xde04]);
        check_encode_wide!(s[2..], vec![0xde00, 0xd83d, 0xde02, 0xd83d, 0xde04]);
        check_encode_wide!(s[..10], vec![0xd83d, 0xde00, 0xd83d, 0xde02, 0xd83d]);
    }

    /// Split and canonical surrogate representations must compare and hash
    /// identically.
    #[test]
    #[cfg(feature = "std")]
    fn omgwtf8_eq_hash() {
        use std::collections::hash_map::DefaultHasher;

        let a = unsafe { Wtf8::from_bytes_unchecked(b"\x90\x8b\xae~\xf0\x90\x80") };
        let b = unsafe { Wtf8::from_bytes_unchecked(b"\xed\xbb\xae~\xf0\x90\x80") };
        let c = unsafe { Wtf8::from_bytes_unchecked(b"\x90\x8b\xae~\xed\xa0\x80") };
        let d = unsafe { Wtf8::from_bytes_unchecked(b"\xed\xbb\xae~\xed\xa0\x80") };

        assert_eq!(a, b);
        assert_eq!(b, c);
        assert_eq!(c, d);

        fn hash<H: Hash>(a: H) -> u64 {
            let mut h = DefaultHasher::new();
            a.hash(&mut h);
            h.finish()
        }

        assert_eq!(hash(a), hash(b));
        assert_eq!(hash(b), hash(c));
        assert_eq!(hash(c), hash(d));
    }

    #[test]
    #[cfg(feature = "std")]
    fn omgwtf8_classify_index() {
        use super::IndexType::*;

        fn consume(input: &Wtf8, expected: &[IndexType]) {
            let actual = (0..expected.len()).map(|i| classify_index(input, i)).collect::<Vec<_>>();
            assert_eq!(&*actual, expected);
        }
        consume(
            Wtf8::from_str(""),
            &[CharBoundary, OutOfBounds, OutOfBounds],
        );
        consume(
            Wtf8::from_str("aa"),
            &[CharBoundary, CharBoundary, CharBoundary, OutOfBounds],
        );
        consume(
            Wtf8::from_str("á"),
            &[CharBoundary, Interior, CharBoundary, OutOfBounds],
        );
        consume(
            Wtf8::from_str("\u{3000}"),
            &[CharBoundary, Interior, Interior, CharBoundary, OutOfBounds],
        );
        consume(
            Wtf8::from_str("\u{30000}"),
            &[CharBoundary, FourByteSeq1, FourByteSeq2, FourByteSeq3, CharBoundary, OutOfBounds],
        );
        consume(
            unsafe { Wtf8::from_bytes_unchecked(b"\xed\xbf\xbf\xed\xa0\x80") },
            &[
                CharBoundary, Interior, Interior,
                CharBoundary, Interior, Interior,
                CharBoundary, OutOfBounds,
            ],
        );
        consume(
            unsafe { Wtf8::from_bytes_unchecked(b"\x90\x80\x80\xf0\x90\x80\x80\xf0\x90\x80") },
            &[
                CharBoundary, Interior, Interior,
                CharBoundary, FourByteSeq1, FourByteSeq2, FourByteSeq3,
                CharBoundary, Interior, Interior,
                CharBoundary, OutOfBounds,
            ],
        );
    }
}