1#![no_std]
37#![allow(clippy::precedence, clippy::match_overlapping_arm)]
38
39extern crate alloc;
40
41use alloc::borrow::{Cow, ToOwned};
42use alloc::boxed::Box;
43use alloc::collections::TryReserveError;
44use alloc::string::String;
45use alloc::vec::Vec;
46use core::borrow::Borrow;
47use core::fmt;
48use core::hash::{Hash, Hasher};
49use core::iter::FusedIterator;
50use core::mem;
51use core::ops;
52use core::slice;
53use core::str;
54use core_char::MAX_LEN_UTF8;
55use core_char::{MAX_LEN_UTF16, encode_utf8_raw, encode_utf16_raw, len_utf8};
56use core_str::{next_code_point, next_code_point_reverse};
57use itertools::{Either, Itertools};
58
59use bstr::{ByteSlice, ByteVec};
60
61mod core_char;
62mod core_str;
63mod core_str_count;
64
/// UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER (three bytes), written in place of
/// surrogates by the lossy conversions and by `Display`.
const UTF8_REPLACEMENT_CHARACTER: &str = "\u{FFFD}";
66
/// A Unicode code point in the range U+0000 to U+10FFFF.
///
/// Unlike `char`, the value may be a surrogate (U+D800 to U+DFFF); see `to_char`.
#[derive(Eq, PartialEq, Ord, PartialOrd, Clone, Copy)]
pub struct CodePoint {
    // Raw numeric value; the checked constructor only admits values up to 0x10FFFF.
    value: u32,
}
76
77impl fmt::Debug for CodePoint {
80 #[inline]
81 fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
82 write!(formatter, "U+{:04X}", self.value)
83 }
84}
85
86impl fmt::Display for CodePoint {
87 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
88 self.to_char_lossy().fmt(f)
89 }
90}
91
92impl CodePoint {
93 #[inline]
99 pub const unsafe fn from_u32_unchecked(value: u32) -> CodePoint {
100 CodePoint { value }
101 }
102
103 #[inline]
107 pub const fn from_u32(value: u32) -> Option<CodePoint> {
108 match value {
109 0..=0x10FFFF => Some(CodePoint { value }),
110 _ => None,
111 }
112 }
113
114 #[inline]
118 pub const fn from_char(value: char) -> CodePoint {
119 CodePoint {
120 value: value as u32,
121 }
122 }
123
124 #[inline]
126 pub const fn to_u32(self) -> u32 {
127 self.value
128 }
129
130 #[inline]
132 pub const fn to_lead_surrogate(self) -> Option<LeadSurrogate> {
133 match self.value {
134 lead @ 0xD800..=0xDBFF => Some(LeadSurrogate(lead as u16)),
135 _ => None,
136 }
137 }
138
139 #[inline]
141 pub const fn to_trail_surrogate(self) -> Option<TrailSurrogate> {
142 match self.value {
143 trail @ 0xDC00..=0xDFFF => Some(TrailSurrogate(trail as u16)),
144 _ => None,
145 }
146 }
147
148 #[inline]
152 pub const fn to_char(self) -> Option<char> {
153 match self.value {
154 0xD800..=0xDFFF => None,
155 _ => Some(unsafe { char::from_u32_unchecked(self.value) }),
156 }
157 }
158
159 #[inline]
164 pub fn to_char_lossy(self) -> char {
165 self.to_char().unwrap_or('\u{FFFD}')
166 }
167
168 pub fn is_char_and(self, f: impl FnOnce(char) -> bool) -> bool {
169 self.to_char().is_some_and(f)
170 }
171
172 pub fn encode_wtf8(self, dst: &mut [u8]) -> &mut Wtf8 {
173 unsafe { Wtf8::from_mut_bytes_unchecked(encode_utf8_raw(self.value, dst)) }
174 }
175
176 pub const fn len_wtf8(&self) -> usize {
177 len_utf8(self.value)
178 }
179
180 pub fn is_ascii(&self) -> bool {
181 self.is_char_and(|c| c.is_ascii())
182 }
183}
184
185impl From<u16> for CodePoint {
186 fn from(value: u16) -> Self {
187 unsafe { Self::from_u32_unchecked(value.into()) }
188 }
189}
190
191impl From<u8> for CodePoint {
192 fn from(value: u8) -> Self {
193 char::from(value).into()
194 }
195}
196
197impl From<char> for CodePoint {
198 fn from(value: char) -> Self {
199 Self::from_char(value)
200 }
201}
202
203impl From<ascii::AsciiChar> for CodePoint {
204 fn from(value: ascii::AsciiChar) -> Self {
205 Self::from_char(value.into())
206 }
207}
208
209impl From<CodePoint> for Wtf8Buf {
210 fn from(ch: CodePoint) -> Self {
211 ch.encode_wtf8(&mut [0; MAX_LEN_UTF8]).to_owned()
212 }
213}
214
215impl PartialEq<char> for CodePoint {
216 fn eq(&self, other: &char) -> bool {
217 self.to_u32() == *other as u32
218 }
219}
220impl PartialEq<CodePoint> for char {
221 fn eq(&self, other: &CodePoint) -> bool {
222 *self as u32 == other.to_u32()
223 }
224}
225
/// A lead (high) surrogate code unit; `CodePoint::to_lead_surrogate` produces it only for
/// values in 0xD800..=0xDBFF.
#[derive(Clone, Copy)]
pub struct LeadSurrogate(u16);
228
/// A trail (low) surrogate code unit; `CodePoint::to_trail_surrogate` produces it only for
/// values in 0xDC00..=0xDFFF.
#[derive(Clone, Copy)]
pub struct TrailSurrogate(u16);
231
232impl LeadSurrogate {
233 pub const fn merge(self, trail: TrailSurrogate) -> char {
234 decode_surrogate_pair(self.0, trail.0)
235 }
236}
237
/// An owned, growable WTF-8 string.
///
/// Dereferences to [`Wtf8`]; comparison and ordering are those of the underlying bytes.
#[derive(Eq, PartialEq, Ord, PartialOrd, Clone, Default)]
pub struct Wtf8Buf {
    // Backing storage; the safe constructors only ever store validated WTF-8 here.
    bytes: Vec<u8>,
}
246
247impl ops::Deref for Wtf8Buf {
248 type Target = Wtf8;
249
250 fn deref(&self) -> &Wtf8 {
251 self.as_slice()
252 }
253}
254
255impl ops::DerefMut for Wtf8Buf {
256 fn deref_mut(&mut self) -> &mut Wtf8 {
257 self.as_mut_slice()
258 }
259}
260
261impl Borrow<Wtf8> for Wtf8Buf {
262 fn borrow(&self) -> &Wtf8 {
263 self
264 }
265}
266
267impl fmt::Debug for Wtf8Buf {
274 #[inline]
275 fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
276 fmt::Debug::fmt(&**self, formatter)
277 }
278}
279
280impl fmt::Display for Wtf8Buf {
283 fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
284 fmt::Display::fmt(&**self, formatter)
285 }
286}
287
288impl Wtf8Buf {
289 #[inline]
291 pub fn new() -> Wtf8Buf {
292 Wtf8Buf::default()
293 }
294
295 #[inline]
297 pub fn with_capacity(capacity: usize) -> Wtf8Buf {
298 Wtf8Buf {
299 bytes: Vec::with_capacity(capacity),
300 }
301 }
302
303 #[inline]
309 pub const unsafe fn from_bytes_unchecked(value: Vec<u8>) -> Wtf8Buf {
310 Wtf8Buf { bytes: value }
311 }
312
313 pub fn from_bytes(value: Vec<u8>) -> Result<Self, Vec<u8>> {
315 match Wtf8::from_bytes(&value) {
316 Some(_) => Ok(unsafe { Self::from_bytes_unchecked(value) }),
317 None => Err(value),
318 }
319 }
320
321 #[inline]
327 pub fn from_string(string: String) -> Wtf8Buf {
328 Wtf8Buf {
329 bytes: string.into_bytes(),
330 }
331 }
332
333 pub fn join<I, S>(sep: impl AsRef<Wtf8>, iter: I) -> Wtf8Buf
334 where
335 I: IntoIterator<Item = S>,
336 S: AsRef<Wtf8>,
337 {
338 let sep = sep.as_ref();
339 let mut iter = iter.into_iter();
340 let mut buf = match iter.next() {
341 Some(first) => first.as_ref().to_owned(),
342 None => return Wtf8Buf::new(),
343 };
344 for part in iter {
345 buf.push_wtf8(sep);
346 buf.push_wtf8(part.as_ref());
347 }
348 buf
349 }
350
351 pub fn clear(&mut self) {
352 self.bytes.clear();
353 }
354
355 pub fn from_wide(v: &[u16]) -> Wtf8Buf {
360 let mut string = Wtf8Buf::with_capacity(v.len());
361 for item in char::decode_utf16(v.iter().cloned()) {
362 match item {
363 Ok(ch) => string.push_char(ch),
364 Err(surrogate) => {
365 let surrogate = surrogate.unpaired_surrogate();
366 let code_point = CodePoint::from(surrogate);
368 string.push(code_point);
371 }
372 }
373 }
374 string
375 }
376
377 #[inline]
378 pub fn as_slice(&self) -> &Wtf8 {
379 unsafe { Wtf8::from_bytes_unchecked(&self.bytes) }
380 }
381
382 #[inline]
383 pub fn as_mut_slice(&mut self) -> &mut Wtf8 {
384 unsafe { Wtf8::from_mut_bytes_unchecked(&mut self.bytes) }
388 }
389
390 #[inline]
398 pub fn reserve(&mut self, additional: usize) {
399 self.bytes.reserve(additional)
400 }
401
402 #[inline]
414 pub fn try_reserve(&mut self, additional: usize) -> Result<(), TryReserveError> {
415 self.bytes.try_reserve(additional)
416 }
417
418 #[inline]
419 pub fn reserve_exact(&mut self, additional: usize) {
420 self.bytes.reserve_exact(additional)
421 }
422
423 #[inline]
440 pub fn try_reserve_exact(&mut self, additional: usize) -> Result<(), TryReserveError> {
441 self.bytes.try_reserve_exact(additional)
442 }
443
444 #[inline]
445 pub fn shrink_to_fit(&mut self) {
446 self.bytes.shrink_to_fit()
447 }
448
449 #[inline]
450 pub fn shrink_to(&mut self, min_capacity: usize) {
451 self.bytes.shrink_to(min_capacity)
452 }
453
454 #[inline]
455 pub fn leak<'a>(self) -> &'a mut Wtf8 {
456 unsafe { Wtf8::from_mut_bytes_unchecked(self.bytes.leak()) }
457 }
458
459 #[inline]
461 pub const fn capacity(&self) -> usize {
462 self.bytes.capacity()
463 }
464
465 #[inline]
467 pub fn push_str(&mut self, other: &str) {
468 self.bytes.extend_from_slice(other.as_bytes())
469 }
470
471 #[inline]
473 pub fn push_wtf8(&mut self, other: &Wtf8) {
474 self.bytes.extend_from_slice(&other.bytes);
475 }
476
477 #[inline]
479 pub fn push_char(&mut self, c: char) {
480 self.push(CodePoint::from_char(c))
481 }
482
483 #[inline]
485 pub fn push(&mut self, code_point: CodePoint) {
486 self.push_wtf8(code_point.encode_wtf8(&mut [0; MAX_LEN_UTF8]))
487 }
488
489 pub fn pop(&mut self) -> Option<CodePoint> {
490 let ch = self.code_points().next_back()?;
491 let new_len = self.len() - ch.len_wtf8();
492 self.bytes.truncate(new_len);
493 Some(ch)
494 }
495
496 #[inline]
503 pub fn truncate(&mut self, new_len: usize) {
504 assert!(is_code_point_boundary(self, new_len));
505 self.bytes.truncate(new_len)
506 }
507
508 #[inline]
510 pub fn insert(&mut self, idx: usize, c: CodePoint) {
511 self.insert_wtf8(idx, c.encode_wtf8(&mut [0; MAX_LEN_UTF8]))
512 }
513
514 #[inline]
516 pub fn insert_wtf8(&mut self, idx: usize, w: &Wtf8) {
517 assert!(is_code_point_boundary(self, idx));
518
519 self.bytes.insert_str(idx, w)
520 }
521
522 #[inline]
524 pub fn into_bytes(self) -> Vec<u8> {
525 self.bytes
526 }
527
528 pub fn into_string(self) -> Result<String, Wtf8Buf> {
536 if self.is_utf8() {
537 Ok(unsafe { String::from_utf8_unchecked(self.bytes) })
538 } else {
539 Err(self)
540 }
541 }
542
543 pub fn into_string_lossy(mut self) -> String {
549 let mut pos = 0;
550 while let Some((surrogate_pos, _)) = self.next_surrogate(pos) {
551 pos = surrogate_pos + 3;
552 self.bytes[surrogate_pos..pos].copy_from_slice(UTF8_REPLACEMENT_CHARACTER.as_bytes());
555 }
556 unsafe { String::from_utf8_unchecked(self.bytes) }
557 }
558
559 #[inline]
561 pub fn into_box(self) -> Box<Wtf8> {
562 unsafe { mem::transmute(self.bytes.into_boxed_slice()) }
564 }
565
566 pub fn from_box(boxed: Box<Wtf8>) -> Wtf8Buf {
568 let bytes: Box<[u8]> = unsafe { mem::transmute(boxed) };
569 Wtf8Buf {
570 bytes: bytes.into_vec(),
571 }
572 }
573}
574
575impl FromIterator<CodePoint> for Wtf8Buf {
580 fn from_iter<T: IntoIterator<Item = CodePoint>>(iter: T) -> Wtf8Buf {
581 let mut string = Wtf8Buf::new();
582 string.extend(iter);
583 string
584 }
585}
586
587impl Extend<CodePoint> for Wtf8Buf {
592 fn extend<T: IntoIterator<Item = CodePoint>>(&mut self, iter: T) {
593 let iterator = iter.into_iter();
594 let (low, _high) = iterator.size_hint();
595 self.bytes.reserve(low);
597 iterator.for_each(move |code_point| self.push(code_point));
598 }
599}
600
601impl Extend<char> for Wtf8Buf {
602 fn extend<T: IntoIterator<Item = char>>(&mut self, iter: T) {
603 self.extend(iter.into_iter().map(CodePoint::from))
604 }
605}
606
607impl<W: AsRef<Wtf8>> Extend<W> for Wtf8Buf {
608 fn extend<T: IntoIterator<Item = W>>(&mut self, iter: T) {
609 iter.into_iter()
610 .for_each(move |w| self.push_wtf8(w.as_ref()));
611 }
612}
613
614impl<W: AsRef<Wtf8>> FromIterator<W> for Wtf8Buf {
615 fn from_iter<T: IntoIterator<Item = W>>(iter: T) -> Self {
616 let mut buf = Wtf8Buf::new();
617 iter.into_iter().for_each(|w| buf.push_wtf8(w.as_ref()));
618 buf
619 }
620}
621
622impl Hash for Wtf8Buf {
623 fn hash<H: Hasher>(&self, state: &mut H) {
624 Wtf8::hash(self, state)
625 }
626}
627
628impl AsRef<Wtf8> for Wtf8Buf {
629 fn as_ref(&self) -> &Wtf8 {
630 self
631 }
632}
633
634impl From<String> for Wtf8Buf {
635 fn from(s: String) -> Self {
636 Wtf8Buf::from_string(s)
637 }
638}
639
640impl From<&str> for Wtf8Buf {
641 fn from(s: &str) -> Self {
642 Wtf8Buf::from_string(s.to_owned())
643 }
644}
645
646impl From<ascii::AsciiString> for Wtf8Buf {
647 fn from(s: ascii::AsciiString) -> Self {
648 Wtf8Buf::from_string(s.into())
649 }
650}
651
/// A borrowed slice of WTF-8 data.
///
/// Comparison and ordering are those of the underlying bytes.
///
/// The struct is `#[repr(transparent)]` because this file converts between `[u8]` and
/// `Wtf8` (references, raw pointers and boxes) with pointer casts and `mem::transmute`;
/// those conversions are only guaranteed sound when the wrapper has exactly the layout of
/// its single field.
#[derive(PartialEq, Eq, PartialOrd, Ord)]
#[repr(transparent)]
pub struct Wtf8 {
    bytes: [u8],
}
660
/// Identity conversion, so that generic `AsRef<Wtf8>` parameters also accept `&Wtf8`.
impl AsRef<Wtf8> for Wtf8 {
    fn as_ref(&self) -> &Wtf8 {
        self
    }
}
666
667impl ToOwned for Wtf8 {
668 type Owned = Wtf8Buf;
669
670 fn to_owned(&self) -> Self::Owned {
671 self.to_wtf8_buf()
672 }
673
674 fn clone_into(&self, buf: &mut Self::Owned) {
675 self.bytes.clone_into(&mut buf.bytes);
676 }
677}
678
679impl PartialEq<str> for Wtf8 {
680 fn eq(&self, other: &str) -> bool {
681 self.as_bytes().eq(other.as_bytes())
682 }
683}
684
685impl fmt::Debug for Wtf8 {
689 fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
690 fn write_str_escaped(f: &mut fmt::Formatter<'_>, s: &str) -> fmt::Result {
691 use core::fmt::Write;
692 for c in s.chars().flat_map(|c| c.escape_debug()) {
693 f.write_char(c)?
694 }
695 Ok(())
696 }
697
698 formatter.write_str("\"")?;
699 let mut pos = 0;
700 while let Some((surrogate_pos, surrogate)) = self.next_surrogate(pos) {
701 write_str_escaped(formatter, unsafe {
702 str::from_utf8_unchecked(&self.bytes[pos..surrogate_pos])
703 })?;
704 write!(formatter, "\\u{{{surrogate:x}}}")?;
705 pos = surrogate_pos + 3;
706 }
707 write_str_escaped(formatter, unsafe {
708 str::from_utf8_unchecked(&self.bytes[pos..])
709 })?;
710 formatter.write_str("\"")
711 }
712}
713
714impl fmt::Display for Wtf8 {
717 fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
718 let wtf8_bytes = &self.bytes;
719 let mut pos = 0;
720 loop {
721 match self.next_surrogate(pos) {
722 Some((surrogate_pos, _)) => {
723 formatter.write_str(unsafe {
724 str::from_utf8_unchecked(&wtf8_bytes[pos..surrogate_pos])
725 })?;
726 formatter.write_str(UTF8_REPLACEMENT_CHARACTER)?;
727 pos = surrogate_pos + 3;
728 }
729 None => {
730 let s = unsafe { str::from_utf8_unchecked(&wtf8_bytes[pos..]) };
731 if pos == 0 {
732 return s.fmt(formatter);
733 } else {
734 return formatter.write_str(s);
735 }
736 }
737 }
738 }
739 }
740}
741
742impl Default for &Wtf8 {
743 fn default() -> Self {
744 unsafe { Wtf8::from_bytes_unchecked(&[]) }
745 }
746}
747
748impl Hash for Wtf8 {
749 fn hash<H: Hasher>(&self, state: &mut H) {
750 state.write(self.as_bytes());
751 state.write_u8(0xff);
752 }
753}
754
755impl Wtf8 {
756 #[inline]
760 pub fn new<S: AsRef<Wtf8> + ?Sized>(value: &S) -> &Wtf8 {
761 value.as_ref()
762 }
763
764 #[inline]
770 pub const unsafe fn from_bytes_unchecked(value: &[u8]) -> &Wtf8 {
771 unsafe { &*(value as *const [u8] as *const Wtf8) }
773 }
774
775 #[inline]
780 const unsafe fn from_mut_bytes_unchecked(value: &mut [u8]) -> &mut Wtf8 {
781 unsafe { &mut *(value as *mut [u8] as *mut Wtf8) }
783 }
784
785 #[inline]
789 pub fn from_bytes(b: &[u8]) -> Option<&Self> {
790 let mut rest = b;
791 while let Err(e) = core::str::from_utf8(rest) {
792 rest = &rest[e.valid_up_to()..];
793 let _ = Self::decode_surrogate(rest)?;
794 rest = &rest[3..];
795 }
796 Some(unsafe { Wtf8::from_bytes_unchecked(b) })
797 }
798
799 fn decode_surrogate(b: &[u8]) -> Option<CodePoint> {
800 let [0xed, b2 @ (0xa0..), b3, ..] = *b else {
801 return None;
802 };
803 Some(decode_surrogate(b2, b3).into())
804 }
805
806 #[inline]
808 pub const fn len(&self) -> usize {
809 self.bytes.len()
810 }
811
812 #[inline]
813 pub const fn is_empty(&self) -> bool {
814 self.bytes.is_empty()
815 }
816
817 #[inline]
824 pub const fn ascii_byte_at(&self, position: usize) -> u8 {
825 match self.bytes[position] {
826 ascii_byte @ 0x00..=0x7F => ascii_byte,
827 _ => 0xFF,
828 }
829 }
830
831 #[inline]
833 pub fn code_points(&self) -> Wtf8CodePoints<'_> {
834 Wtf8CodePoints {
835 bytes: self.bytes.iter(),
836 }
837 }
838
839 #[inline]
841 pub fn code_point_indices(&self) -> Wtf8CodePointIndices<'_> {
842 Wtf8CodePointIndices {
843 front_offset: 0,
844 iter: self.code_points(),
845 }
846 }
847
848 #[inline]
850 pub const fn as_bytes(&self) -> &[u8] {
851 &self.bytes
852 }
853
854 #[inline]
860 pub const fn as_str(&self) -> Result<&str, str::Utf8Error> {
861 str::from_utf8(&self.bytes)
862 }
863
864 pub fn to_wtf8_buf(&self) -> Wtf8Buf {
866 Wtf8Buf {
867 bytes: self.bytes.to_vec(),
868 }
869 }
870
871 pub fn to_string_lossy(&self) -> Cow<'_, str> {
878 let Some((surrogate_pos, _)) = self.next_surrogate(0) else {
879 return Cow::Borrowed(unsafe { str::from_utf8_unchecked(&self.bytes) });
880 };
881 let wtf8_bytes = &self.bytes;
882 let mut utf8_bytes = Vec::with_capacity(self.len());
883 utf8_bytes.extend_from_slice(&wtf8_bytes[..surrogate_pos]);
884 utf8_bytes.extend_from_slice(UTF8_REPLACEMENT_CHARACTER.as_bytes());
885 let mut pos = surrogate_pos + 3;
886 loop {
887 match self.next_surrogate(pos) {
888 Some((surrogate_pos, _)) => {
889 utf8_bytes.extend_from_slice(&wtf8_bytes[pos..surrogate_pos]);
890 utf8_bytes.extend_from_slice(UTF8_REPLACEMENT_CHARACTER.as_bytes());
891 pos = surrogate_pos + 3;
892 }
893 None => {
894 utf8_bytes.extend_from_slice(&wtf8_bytes[pos..]);
895 return Cow::Owned(unsafe { String::from_utf8_unchecked(utf8_bytes) });
896 }
897 }
898 }
899 }
900
901 #[inline]
908 pub fn encode_wide(&self) -> EncodeWide<'_> {
909 EncodeWide {
910 code_points: self.code_points(),
911 extra: 0,
912 }
913 }
914
915 pub const fn chunks(&self) -> Wtf8Chunks<'_> {
916 Wtf8Chunks { wtf8: self }
917 }
918
919 pub fn map_utf8<'a, I>(&'a self, f: impl Fn(&'a str) -> I) -> impl Iterator<Item = CodePoint>
920 where
921 I: Iterator<Item = char>,
922 {
923 self.chunks().flat_map(move |chunk| match chunk {
924 Wtf8Chunk::Utf8(s) => Either::Left(f(s).map_into()),
925 Wtf8Chunk::Surrogate(c) => Either::Right(core::iter::once(c)),
926 })
927 }
928
929 #[inline]
930 fn next_surrogate(&self, mut pos: usize) -> Option<(usize, u16)> {
931 let mut iter = self.bytes[pos..].iter();
932 loop {
933 let b = *iter.next()?;
934 if b < 0x80 {
935 pos += 1;
936 } else if b < 0xE0 {
937 iter.next();
938 pos += 2;
939 } else if b == 0xED {
940 match (iter.next(), iter.next()) {
941 (Some(&b2), Some(&b3)) if b2 >= 0xA0 => {
942 return Some((pos, decode_surrogate(b2, b3)));
943 }
944 _ => pos += 3,
945 }
946 } else if b < 0xF0 {
947 iter.next();
948 iter.next();
949 pos += 3;
950 } else {
951 iter.next();
952 iter.next();
953 iter.next();
954 pos += 4;
955 }
956 }
957 }
958
959 pub fn is_code_point_boundary(&self, index: usize) -> bool {
960 is_code_point_boundary(self, index)
961 }
962
963 #[inline]
965 pub fn into_box(&self) -> Box<Wtf8> {
966 let boxed: Box<[u8]> = self.bytes.into();
967 unsafe { mem::transmute(boxed) }
968 }
969
970 pub fn empty_box() -> Box<Wtf8> {
972 let boxed: Box<[u8]> = Default::default();
973 unsafe { mem::transmute(boxed) }
974 }
975
976 #[inline]
977 pub fn make_ascii_lowercase(&mut self) {
978 self.bytes.make_ascii_lowercase()
979 }
980
981 #[inline]
982 pub fn make_ascii_uppercase(&mut self) {
983 self.bytes.make_ascii_uppercase()
984 }
985
986 #[inline]
987 pub fn to_ascii_lowercase(&self) -> Wtf8Buf {
988 Wtf8Buf {
989 bytes: self.bytes.to_ascii_lowercase(),
990 }
991 }
992
993 #[inline]
994 pub fn to_ascii_uppercase(&self) -> Wtf8Buf {
995 Wtf8Buf {
996 bytes: self.bytes.to_ascii_uppercase(),
997 }
998 }
999
1000 pub fn to_lowercase(&self) -> Wtf8Buf {
1001 let mut buf = Wtf8Buf::with_capacity(self.len());
1002 for chunk in self.chunks() {
1003 match chunk {
1004 Wtf8Chunk::Utf8(s) => buf.push_str(&s.to_lowercase()),
1005 Wtf8Chunk::Surrogate(c) => buf.push(c),
1006 }
1007 }
1008 buf
1009 }
1010
1011 pub fn to_uppercase(&self) -> Wtf8Buf {
1012 let mut buf = Wtf8Buf::with_capacity(self.len());
1013 for chunk in self.chunks() {
1014 match chunk {
1015 Wtf8Chunk::Utf8(s) => buf.push_str(&s.to_uppercase()),
1016 Wtf8Chunk::Surrogate(c) => buf.push(c),
1017 }
1018 }
1019 buf
1020 }
1021
1022 #[inline]
1023 pub const fn is_ascii(&self) -> bool {
1024 self.bytes.is_ascii()
1025 }
1026
1027 #[inline]
1028 pub fn is_utf8(&self) -> bool {
1029 self.next_surrogate(0).is_none()
1030 }
1031
1032 #[inline]
1033 pub fn eq_ignore_ascii_case(&self, other: &Self) -> bool {
1034 self.bytes.eq_ignore_ascii_case(&other.bytes)
1035 }
1036
1037 pub fn split(&self, pat: &Wtf8) -> impl Iterator<Item = &Self> {
1038 self.as_bytes()
1039 .split_str(pat)
1040 .map(|w| unsafe { Wtf8::from_bytes_unchecked(w) })
1041 }
1042
1043 pub fn splitn(&self, n: usize, pat: &Wtf8) -> impl Iterator<Item = &Self> {
1044 self.as_bytes()
1045 .splitn_str(n, pat)
1046 .map(|w| unsafe { Wtf8::from_bytes_unchecked(w) })
1047 }
1048
1049 pub fn rsplit(&self, pat: &Wtf8) -> impl Iterator<Item = &Self> {
1050 self.as_bytes()
1051 .rsplit_str(pat)
1052 .map(|w| unsafe { Wtf8::from_bytes_unchecked(w) })
1053 }
1054
1055 pub fn rsplitn(&self, n: usize, pat: &Wtf8) -> impl Iterator<Item = &Self> {
1056 self.as_bytes()
1057 .rsplitn_str(n, pat)
1058 .map(|w| unsafe { Wtf8::from_bytes_unchecked(w) })
1059 }
1060
1061 pub fn trim(&self) -> &Self {
1062 let w = self.bytes.trim();
1063 unsafe { Wtf8::from_bytes_unchecked(w) }
1064 }
1065
1066 pub fn trim_start(&self) -> &Self {
1067 let w = self.bytes.trim_start();
1068 unsafe { Wtf8::from_bytes_unchecked(w) }
1069 }
1070
1071 pub fn trim_end(&self) -> &Self {
1072 let w = self.bytes.trim_end();
1073 unsafe { Wtf8::from_bytes_unchecked(w) }
1074 }
1075
1076 pub fn trim_start_matches(&self, f: impl Fn(CodePoint) -> bool) -> &Self {
1077 let mut iter = self.code_points();
1078 loop {
1079 let old = iter.clone();
1080 match iter.next().map(&f) {
1081 Some(true) => continue,
1082 Some(false) => {
1083 iter = old;
1084 break;
1085 }
1086 None => return iter.as_wtf8(),
1087 }
1088 }
1089 iter.as_wtf8()
1090 }
1091
1092 pub fn trim_end_matches(&self, f: impl Fn(CodePoint) -> bool) -> &Self {
1093 let mut iter = self.code_points();
1094 loop {
1095 let old = iter.clone();
1096 match iter.next_back().map(&f) {
1097 Some(true) => continue,
1098 Some(false) => {
1099 iter = old;
1100 break;
1101 }
1102 None => return iter.as_wtf8(),
1103 }
1104 }
1105 iter.as_wtf8()
1106 }
1107
1108 pub fn trim_matches(&self, f: impl Fn(CodePoint) -> bool) -> &Self {
1109 self.trim_start_matches(&f).trim_end_matches(&f)
1110 }
1111
1112 pub fn find(&self, pat: &Wtf8) -> Option<usize> {
1113 memchr::memmem::find(self.as_bytes(), pat.as_bytes())
1114 }
1115
1116 pub fn rfind(&self, pat: &Wtf8) -> Option<usize> {
1117 memchr::memmem::rfind(self.as_bytes(), pat.as_bytes())
1118 }
1119
1120 pub fn find_iter(&self, pat: &Wtf8) -> impl Iterator<Item = usize> {
1121 memchr::memmem::find_iter(self.as_bytes(), pat.as_bytes())
1122 }
1123
1124 pub fn rfind_iter(&self, pat: &Wtf8) -> impl Iterator<Item = usize> {
1125 memchr::memmem::rfind_iter(self.as_bytes(), pat.as_bytes())
1126 }
1127
1128 pub fn contains(&self, pat: &Wtf8) -> bool {
1129 self.bytes.contains_str(pat)
1130 }
1131
1132 pub fn contains_code_point(&self, pat: CodePoint) -> bool {
1133 self.bytes
1134 .contains_str(pat.encode_wtf8(&mut [0; MAX_LEN_UTF8]))
1135 }
1136
1137 pub fn get(&self, range: impl ops::RangeBounds<usize>) -> Option<&Self> {
1138 let start = match range.start_bound() {
1139 ops::Bound::Included(&i) => i,
1140 ops::Bound::Excluded(&i) => i.saturating_add(1),
1141 ops::Bound::Unbounded => 0,
1142 };
1143 let end = match range.end_bound() {
1144 ops::Bound::Included(&i) => i.saturating_add(1),
1145 ops::Bound::Excluded(&i) => i,
1146 ops::Bound::Unbounded => self.len(),
1147 };
1148 if start <= end && is_code_point_boundary(self, start) && is_code_point_boundary(self, end)
1150 {
1151 Some(unsafe { slice_unchecked(self, start, end) })
1152 } else {
1153 None
1154 }
1155 }
1156
1157 pub fn ends_with(&self, w: impl AsRef<Wtf8>) -> bool {
1158 self.bytes.ends_with_str(w.as_ref())
1159 }
1160
1161 pub fn starts_with(&self, w: impl AsRef<Wtf8>) -> bool {
1162 self.bytes.starts_with_str(w.as_ref())
1163 }
1164
1165 pub fn strip_prefix(&self, w: impl AsRef<Wtf8>) -> Option<&Self> {
1166 self.bytes
1167 .strip_prefix(w.as_ref().as_bytes())
1168 .map(|w| unsafe { Wtf8::from_bytes_unchecked(w) })
1169 }
1170
1171 pub fn strip_suffix(&self, w: impl AsRef<Wtf8>) -> Option<&Self> {
1172 self.bytes
1173 .strip_suffix(w.as_ref().as_bytes())
1174 .map(|w| unsafe { Wtf8::from_bytes_unchecked(w) })
1175 }
1176
1177 pub fn replace(&self, from: &Wtf8, to: &Wtf8) -> Wtf8Buf {
1178 let w = self.bytes.replace(from, to);
1179 unsafe { Wtf8Buf::from_bytes_unchecked(w) }
1180 }
1181
1182 pub fn replacen(&self, from: &Wtf8, to: &Wtf8, n: usize) -> Wtf8Buf {
1183 let w = self.bytes.replacen(from, to, n);
1184 unsafe { Wtf8Buf::from_bytes_unchecked(w) }
1185 }
1186}
1187
1188impl AsRef<Wtf8> for str {
1189 fn as_ref(&self) -> &Wtf8 {
1190 unsafe { Wtf8::from_bytes_unchecked(self.as_bytes()) }
1191 }
1192}
1193
1194impl AsRef<[u8]> for Wtf8 {
1195 fn as_ref(&self) -> &[u8] {
1196 self.as_bytes()
1197 }
1198}
1199
1200impl ops::Index<ops::Range<usize>> for Wtf8 {
1207 type Output = Wtf8;
1208
1209 #[inline]
1210 #[track_caller]
1211 fn index(&self, range: ops::Range<usize>) -> &Wtf8 {
1212 if range.start <= range.end
1214 && is_code_point_boundary(self, range.start)
1215 && is_code_point_boundary(self, range.end)
1216 {
1217 unsafe { slice_unchecked(self, range.start, range.end) }
1218 } else {
1219 slice_error_fail(self, range.start, range.end)
1220 }
1221 }
1222}
1223
1224impl ops::Index<ops::RangeFrom<usize>> for Wtf8 {
1231 type Output = Wtf8;
1232
1233 #[inline]
1234 #[track_caller]
1235 fn index(&self, range: ops::RangeFrom<usize>) -> &Wtf8 {
1236 if is_code_point_boundary(self, range.start) {
1238 unsafe { slice_unchecked(self, range.start, self.len()) }
1239 } else {
1240 slice_error_fail(self, range.start, self.len())
1241 }
1242 }
1243}
1244
1245impl ops::Index<ops::RangeTo<usize>> for Wtf8 {
1252 type Output = Wtf8;
1253
1254 #[inline]
1255 #[track_caller]
1256 fn index(&self, range: ops::RangeTo<usize>) -> &Wtf8 {
1257 if is_code_point_boundary(self, range.end) {
1259 unsafe { slice_unchecked(self, 0, range.end) }
1260 } else {
1261 slice_error_fail(self, 0, range.end)
1262 }
1263 }
1264}
1265
/// Indexing with `..` returns the whole slice unchanged.
impl ops::Index<ops::RangeFull> for Wtf8 {
    type Output = Wtf8;

    #[inline]
    fn index(&self, _range: ops::RangeFull) -> &Wtf8 {
        self
    }
}
1274
/// Rebuilds a surrogate's 16-bit value from the second and third bytes of its three-byte
/// encoding (the first byte is always 0xED and carries no further information).
#[inline]
const fn decode_surrogate(second_byte: u8, third_byte: u8) -> u16 {
    // Each trailing byte contributes its low six bits.
    let high_bits = ((second_byte & 0x3F) as u16) << 6;
    let low_bits = (third_byte & 0x3F) as u16;
    0xD800 | high_bits | low_bits
}
1280
/// Combines a lead and a trail surrogate into the supplementary-plane `char` they encode.
///
/// Callers must pass a lead in 0xD800..=0xDBFF and a trail in 0xDC00..=0xDFFF.
#[inline]
const fn decode_surrogate_pair(lead: u16, trail: u16) -> char {
    let high_ten = ((lead - 0xD800) as u32) << 10;
    let low_ten = (trail - 0xDC00) as u32;
    let code_point = 0x10000 + (high_ten | low_ten);
    // SAFETY: with in-range surrogates the result lies in 0x10000..=0x10FFFF.
    unsafe { char::from_u32_unchecked(code_point) }
}
1286
1287#[inline]
1289fn is_code_point_boundary(slice: &Wtf8, index: usize) -> bool {
1290 if index == 0 {
1291 return true;
1292 }
1293 match slice.bytes.get(index) {
1294 None => index == slice.len(),
1295 Some(&b) => (b as i8) >= -0x40,
1296 }
1297}
1298
1299#[track_caller]
1307#[inline]
1308pub fn check_utf8_boundary(slice: &Wtf8, index: usize) {
1309 if index == 0 {
1310 return;
1311 }
1312 match slice.bytes.get(index) {
1313 Some(0xED) => (), Some(&b) if (b as i8) >= -0x40 => return,
1315 Some(_) => panic!("byte index {index} is not a codepoint boundary"),
1316 None if index == slice.len() => return,
1317 None => panic!("byte index {index} is out of bounds"),
1318 }
1319 if slice.bytes[index + 1] >= 0xA0 {
1320 if index >= 3 && slice.bytes[index - 3] == 0xED && slice.bytes[index - 2] >= 0xA0 {
1322 panic!("byte index {index} lies between surrogate codepoints");
1323 }
1324 }
1325}
1326
1327#[inline]
1333pub const unsafe fn slice_unchecked(s: &Wtf8, begin: usize, end: usize) -> &Wtf8 {
1334 unsafe {
1336 let len = end - begin;
1337 let start = s.as_bytes().as_ptr().add(begin);
1338 Wtf8::from_bytes_unchecked(slice::from_raw_parts(start, len))
1339 }
1340}
1341
/// Panics to report a failed slicing operation on `s`; never returns.
///
/// Kept out of line (`#[inline(never)]`) and marked `#[track_caller]` so the panic is
/// attributed to the caller's location.
#[inline(never)]
#[track_caller]
pub fn slice_error_fail(s: &Wtf8, begin: usize, end: usize) -> ! {
    // An inverted range is reported by the assertion instead of the boundary message.
    assert!(begin <= end);
    panic!("index {begin} and/or {end} in `{s:?}` do not lie on character boundary");
}
1349
/// Double-ended iterator over the code points of a WTF-8 string.
///
/// Created by `Wtf8::code_points`.
#[derive(Clone)]
pub struct Wtf8CodePoints<'a> {
    // Bytes not yet consumed from either end.
    bytes: slice::Iter<'a, u8>,
}
1357
1358impl Iterator for Wtf8CodePoints<'_> {
1359 type Item = CodePoint;
1360
1361 #[inline]
1362 fn next(&mut self) -> Option<CodePoint> {
1363 unsafe { next_code_point(&mut self.bytes).map(|c| CodePoint { value: c }) }
1365 }
1366
1367 #[inline]
1368 fn size_hint(&self) -> (usize, Option<usize>) {
1369 let len = self.bytes.len();
1370 (len.saturating_add(3) / 4, Some(len))
1371 }
1372
1373 fn last(mut self) -> Option<Self::Item> {
1374 self.next_back()
1375 }
1376
1377 fn count(self) -> usize {
1378 core_str_count::count_chars(self.as_wtf8())
1379 }
1380}
1381
1382impl DoubleEndedIterator for Wtf8CodePoints<'_> {
1383 #[inline]
1384 fn next_back(&mut self) -> Option<CodePoint> {
1385 unsafe {
1388 next_code_point_reverse(&mut self.bytes).map(|ch| CodePoint::from_u32_unchecked(ch))
1389 }
1390 }
1391}
1392
1393impl<'a> Wtf8CodePoints<'a> {
1394 pub fn as_wtf8(&self) -> &'a Wtf8 {
1395 unsafe { Wtf8::from_bytes_unchecked(self.bytes.as_slice()) }
1396 }
1397}
1398
/// Iterator over `(byte offset, code point)` pairs of a WTF-8 string.
///
/// Created by `Wtf8::code_point_indices`.
#[derive(Clone)]
pub struct Wtf8CodePointIndices<'a> {
    // Byte offset, within the original string, of the next code point from the front.
    front_offset: usize,
    // Underlying code point iterator.
    iter: Wtf8CodePoints<'a>,
}
1404
1405impl Iterator for Wtf8CodePointIndices<'_> {
1406 type Item = (usize, CodePoint);
1407
1408 #[inline]
1409 fn next(&mut self) -> Option<(usize, CodePoint)> {
1410 let pre_len = self.iter.bytes.len();
1411 match self.iter.next() {
1412 None => None,
1413 Some(ch) => {
1414 let index = self.front_offset;
1415 let len = self.iter.bytes.len();
1416 self.front_offset += pre_len - len;
1417 Some((index, ch))
1418 }
1419 }
1420 }
1421
1422 #[inline]
1423 fn size_hint(&self) -> (usize, Option<usize>) {
1424 self.iter.size_hint()
1425 }
1426
1427 #[inline]
1428 fn last(mut self) -> Option<(usize, CodePoint)> {
1429 self.next_back()
1431 }
1432
1433 #[inline]
1434 fn count(self) -> usize {
1435 self.iter.count()
1436 }
1437}
1438
1439impl DoubleEndedIterator for Wtf8CodePointIndices<'_> {
1440 #[inline]
1441 fn next_back(&mut self) -> Option<(usize, CodePoint)> {
1442 self.iter.next_back().map(|ch| {
1443 let index = self.front_offset + self.iter.bytes.len();
1444 (index, ch)
1445 })
1446 }
1447}
1448
// Marker: once `next` has returned `None`, it keeps returning `None`.
impl FusedIterator for Wtf8CodePointIndices<'_> {}
1450
/// Iterator over the UTF-16 code units of a WTF-8 string.
///
/// Created by `Wtf8::encode_wide`.
#[derive(Clone)]
pub struct EncodeWide<'a> {
    // Source of code points still to be encoded.
    code_points: Wtf8CodePoints<'a>,
    // Pending second code unit of a two-unit encoding; 0 means nothing is pending.
    extra: u16,
}
1457
1458impl Iterator for EncodeWide<'_> {
1460 type Item = u16;
1461
1462 #[inline]
1463 fn next(&mut self) -> Option<u16> {
1464 if self.extra != 0 {
1465 let tmp = self.extra;
1466 self.extra = 0;
1467 return Some(tmp);
1468 }
1469
1470 let mut buf = [0; MAX_LEN_UTF16];
1471 self.code_points.next().map(|code_point| {
1472 let n = encode_utf16_raw(code_point.value, &mut buf).len();
1473 if n == 2 {
1474 self.extra = buf[1];
1475 }
1476 buf[0]
1477 })
1478 }
1479
1480 #[inline]
1481 fn size_hint(&self) -> (usize, Option<usize>) {
1482 let (low, high) = self.code_points.size_hint();
1483 let ext = (self.extra != 0) as usize;
1484 (
1488 low + ext,
1489 high.and_then(|n| n.checked_mul(2))
1490 .and_then(|n| n.checked_add(ext)),
1491 )
1492 }
1493}
1494
// Marker: once `next` has returned `None`, it keeps returning `None`.
impl FusedIterator for EncodeWide<'_> {}
1496
/// Iterator that splits a WTF-8 string into UTF-8 runs and individual surrogates.
///
/// Created by `Wtf8::chunks`.
pub struct Wtf8Chunks<'a> {
    // The part of the string that has not been yielded yet.
    wtf8: &'a Wtf8,
}
1500
1501impl<'a> Iterator for Wtf8Chunks<'a> {
1502 type Item = Wtf8Chunk<'a>;
1503
1504 fn next(&mut self) -> Option<Self::Item> {
1505 match self.wtf8.next_surrogate(0) {
1506 Some((0, surrogate)) => {
1507 self.wtf8 = &self.wtf8[3..];
1508 Some(Wtf8Chunk::Surrogate(surrogate.into()))
1509 }
1510 Some((n, _)) => {
1511 let s = unsafe { str::from_utf8_unchecked(&self.wtf8.as_bytes()[..n]) };
1512 self.wtf8 = &self.wtf8[n..];
1513 Some(Wtf8Chunk::Utf8(s))
1514 }
1515 None => {
1516 let s =
1517 unsafe { str::from_utf8_unchecked(core::mem::take(&mut self.wtf8).as_bytes()) };
1518 (!s.is_empty()).then_some(Wtf8Chunk::Utf8(s))
1519 }
1520 }
1521 }
1522}
1523
/// One item produced by `Wtf8Chunks`.
pub enum Wtf8Chunk<'a> {
    /// A non-empty run of valid UTF-8 containing no surrogates.
    Utf8(&'a str),
    /// A single surrogate code point.
    Surrogate(CodePoint),
}
1528
1529impl Hash for CodePoint {
1530 #[inline]
1531 fn hash<H: Hasher>(&self, state: &mut H) {
1532 self.value.hash(state)
1533 }
1534}
1535
1536pub unsafe fn from_boxed_wtf8_unchecked(value: Box<[u8]>) -> Box<Wtf8> {
1542 unsafe { Box::from_raw(Box::into_raw(value) as *mut Wtf8) }
1543}
1544
1545impl Clone for Box<Wtf8> {
1546 fn clone(&self) -> Self {
1547 (&**self).into()
1548 }
1549}
1550
1551impl Default for Box<Wtf8> {
1552 fn default() -> Self {
1553 unsafe { from_boxed_wtf8_unchecked(Box::default()) }
1554 }
1555}
1556
1557impl From<&Wtf8> for Box<Wtf8> {
1558 fn from(w: &Wtf8) -> Self {
1559 w.into_box()
1560 }
1561}
1562
1563impl<'a> From<&'a str> for &'a Wtf8 {
1564 #[inline]
1565 fn from(s: &'a str) -> &'a Wtf8 {
1566 unsafe { Wtf8::from_bytes_unchecked(s.as_bytes()) }
1568 }
1569}
1570
1571impl From<&str> for Box<Wtf8> {
1572 fn from(s: &str) -> Self {
1573 Box::<str>::from(s).into()
1574 }
1575}
1576
1577impl From<Box<str>> for Box<Wtf8> {
1578 fn from(s: Box<str>) -> Self {
1579 unsafe { from_boxed_wtf8_unchecked(s.into_boxed_bytes()) }
1580 }
1581}
1582
1583impl From<Box<ascii::AsciiStr>> for Box<Wtf8> {
1584 fn from(s: Box<ascii::AsciiStr>) -> Self {
1585 <Box<str>>::from(s).into()
1586 }
1587}
1588
1589impl From<Box<Wtf8>> for Box<[u8]> {
1590 fn from(w: Box<Wtf8>) -> Self {
1591 unsafe { Box::from_raw(Box::into_raw(w) as *mut [u8]) }
1592 }
1593}
1594
1595impl From<Wtf8Buf> for Box<Wtf8> {
1596 fn from(w: Wtf8Buf) -> Self {
1597 w.into_box()
1598 }
1599}
1600
1601impl From<Box<Wtf8>> for Wtf8Buf {
1602 fn from(w: Box<Wtf8>) -> Self {
1603 Wtf8Buf::from_box(w)
1604 }
1605}
1606
1607impl From<String> for Box<Wtf8> {
1608 fn from(s: String) -> Self {
1609 s.into_boxed_str().into()
1610 }
1611}
1612
1613mod concat;
1614pub use concat::Wtf8Concat;