1#[cfg(test)]
4mod tests;
5
6use crate::wtf8buf::Wtf8Buf;
7use crate::{codepoint, decode_surrogate, CodePoint};
8use alloc::borrow::Cow;
9use alloc::boxed::Box;
10use alloc::rc::Rc;
11use alloc::sync::Arc;
12use alloc::vec::Vec;
13use core::iter::FusedIterator;
14use core::ops::Index;
15use core::{fmt, slice, str};
16
17mod index;
18
19pub use index::*;
20
/// A borrowed, dynamically sized slice of WTF-8 encoded bytes.
///
/// The type is a transparent wrapper around `[u8]`, which is what allows the
/// pointer casts used elsewhere in this file (`str` -> `Wtf8`,
/// `Box<[u8]>` -> `Box<Wtf8>`, ...). Equality and ordering are derived and
/// therefore compare the underlying bytes.
#[repr(transparent)]
#[derive(Eq, PartialEq, Ord, PartialOrd)]
pub struct Wtf8 {
    /// The encoded bytes. Code in this file treats them as well-formed WTF-8
    /// (surrogate-free runs are turned into `str` without validation).
    bytes: [u8],
}
27
28impl Wtf8 {
29 #[inline]
30 pub(crate) fn bytes(&self) -> &[u8] {
31 &self.bytes
32 }
33
34 #[inline]
36 pub fn new<T: ?Sized + AsRef<Wtf8>>(x: &T) -> &Self {
37 x.as_ref()
38 }
39
40 #[inline]
42 pub fn len(&self) -> usize {
43 self.bytes.len()
44 }
45
46 #[inline]
48 pub fn is_empty(&self) -> bool {
49 self.bytes.is_empty()
50 }
51
52 #[inline]
59 pub fn ascii_byte_at(&self, position: usize) -> u8 {
60 match self.bytes[position] {
61 ascii_byte @ 0x00..=0x7F => ascii_byte,
62 _ => 0xFF,
63 }
64 }
65
66 #[inline]
68 pub fn code_points(&self) -> CodePoints<'_> {
69 CodePoints {
70 bytes: self.bytes.iter(),
71 }
72 }
73
74 pub fn to_str(&self) -> Result<&str, ToStrError> {
80 let mut chunks = self.chunks();
81
82 let x = match chunks.next() {
83 Some(Wtf8Chunk::Utf8(str)) => str,
84 Some(Wtf8Chunk::UnpairedSurrogate(_)) => return Err(ToStrError { valid_up_to: 0 }),
85 None => return Ok(""),
86 };
87
88 if chunks.next().is_some() {
89 return Err(ToStrError {
90 valid_up_to: x.len(),
91 });
92 }
93
94 Ok(x)
95 }
96
97 #[inline]
103 pub fn chunks(&self) -> Chunks {
104 Chunks(&self.bytes)
105 }
106
107 pub fn to_string_lossy(&self) -> Cow<str> {
114 let mut chunks = self.chunks();
115
116 if chunks.next_surrogate().is_none() {
117 return Cow::Borrowed(chunks.next().and_then(Wtf8Chunk::utf8).unwrap_or(""));
118 }
119
120 let chunks: Vec<_> = chunks.map(|a| a.utf8().unwrap_or("\u{FFFD}")).collect();
121 Cow::Owned(chunks.join(""))
122 }
123
124 #[inline]
128 pub fn get<I: Wtf8Index>(&self, i: I) -> Option<&Self> {
129 i.get(self)
130 }
131
132 #[inline]
135 pub fn encode_utf16(&self) -> EncodeUtf16<'_> {
136 EncodeUtf16(CodePoint::encode_utf16(self.code_points()))
137 }
138
139 #[inline]
145 pub unsafe fn get_unchecked<I: Wtf8Index>(&self, i: I) -> &Self {
146 &*i.get_unchecked(self)
147 }
148
149 #[inline]
151 pub fn is_code_point_boundary(&self, index: usize) -> bool {
152 if index == self.len() {
153 return true;
154 }
155 !matches!(self.bytes.get(index), None | Some(128..=191))
156 }
157
158 #[inline]
160 pub fn to_box(&self) -> Box<Wtf8> {
161 let boxed: Box<[u8]> = self.bytes.into();
162 unsafe { Box::from_raw(Box::into_raw(boxed) as *mut Wtf8) }
164 }
165
166 pub fn empty_box() -> Box<Wtf8> {
168 let boxed: Box<[u8]> = Default::default();
169 unsafe { Box::from_raw(Box::into_raw(boxed) as *mut Wtf8) }
171 }
172
173 #[inline]
175 pub fn to_arc(&self) -> Arc<Wtf8> {
176 let arc: Arc<[u8]> = Arc::from(&self.bytes);
177 unsafe { Arc::from_raw(Arc::into_raw(arc) as *const Wtf8) }
179 }
180
181 #[inline]
183 pub fn to_rc(&self) -> Rc<Wtf8> {
184 let rc: Rc<[u8]> = Rc::from(&self.bytes);
185 unsafe { Rc::from_raw(Rc::into_raw(rc) as *const Wtf8) }
187 }
188
189 #[inline]
199 pub fn make_ascii_lowercase(&mut self) {
200 self.bytes.make_ascii_lowercase()
201 }
202
203 #[inline]
213 pub fn make_ascii_uppercase(&mut self) {
214 self.bytes.make_ascii_uppercase()
215 }
216
217 #[inline]
223 pub fn to_ascii_lowercase(&self) -> Wtf8Buf {
224 Wtf8Buf::from_bytes(self.bytes.to_ascii_lowercase())
225 }
226
227 #[inline]
237 pub fn to_ascii_uppercase(&self) -> Wtf8Buf {
238 Wtf8Buf::from_bytes(self.bytes.to_ascii_uppercase())
239 }
240
241 #[inline]
243 pub fn is_ascii(&self) -> bool {
244 self.bytes.is_ascii()
245 }
246
247 #[inline]
252 pub fn eq_ignore_ascii_case(&self, other: &Self) -> bool {
253 self.bytes.eq_ignore_ascii_case(&other.bytes)
254 }
255
256 #[inline]
257 pub(crate) fn initial_trail_surrogate(&self) -> Option<u16> {
258 match self.bytes {
259 [0xED, b2 @ 0xB0..=0xBF, b3, ..] => Some(decode_surrogate(b2, b3)),
260 _ => None,
261 }
262 }
263}
264
265impl From<&Wtf8> for Box<Wtf8> {
266 #[inline]
267 fn from(x: &Wtf8) -> Self {
268 x.to_box()
269 }
270}
271
272impl From<&Wtf8> for Rc<Wtf8> {
273 #[inline]
274 fn from(x: &Wtf8) -> Self {
275 x.to_rc()
276 }
277}
278
279impl From<&Wtf8> for Arc<Wtf8> {
280 #[inline]
281 fn from(x: &Wtf8) -> Self {
282 x.to_arc()
283 }
284}
285
286impl fmt::Debug for Wtf8 {
287 fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
288 use core::fmt::Write;
289
290 formatter.write_str("\"")?;
291
292 for c in self.chunks() {
293 match c {
294 Wtf8Chunk::Utf8(c) => {
295 for ch in c.chars().flat_map(|x| x.escape_debug()) {
296 formatter.write_char(ch)?;
297 }
298 }
299 Wtf8Chunk::UnpairedSurrogate(e) => write!(formatter, "\\u{{{:x}}}", e)?,
300 }
301 }
302
303 formatter.write_str("\"")
304 }
305}
306
307impl fmt::Display for Wtf8 {
308 fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
309 for chunk in self.chunks() {
310 formatter.write_str(chunk.utf8().unwrap_or("\u{FFFD}"))?;
311 }
312
313 Ok(())
314 }
315}
316
317impl<T: Wtf8Index> Index<T> for Wtf8 {
324 type Output = Wtf8;
325
326 #[inline]
327 fn index(&self, index: T) -> &Wtf8 {
328 match self.get(index.clone()) {
329 Some(x) => x,
330 None => panic!(
331 "index {:?} in `{:?}` do not lie on character boundary",
332 index, self
333 ),
334 }
335 }
336}
337
338impl AsRef<Wtf8> for str {
339 #[inline]
340 fn as_ref(&self) -> &Wtf8 {
341 unsafe { &*(self as *const str as *const Wtf8) }
344 }
345}
346
/// Error produced by `Wtf8::to_str` when the slice contains an unpaired
/// surrogate and so cannot be viewed as `&str`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub struct ToStrError {
    /// Byte offset at which the offending sequence starts.
    valid_up_to: usize,
}

impl ToStrError {
    /// Returns the byte offset up to which the slice was valid UTF-8, which
    /// is where the offending sequence starts.
    #[inline]
    pub fn valid_up_to(&self) -> usize {
        let Self { valid_up_to } = *self;
        valid_up_to
    }

    /// Returns the length in bytes of the offending sequence, which is
    /// always 3.
    #[inline]
    pub fn error_len(&self) -> usize {
        const INVALID_SEQUENCE_LEN: usize = 3;
        INVALID_SEQUENCE_LEN
    }
}

impl fmt::Display for ToStrError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_fmt(format_args!(
            "invalid utf-8 sequence of 3 bytes from index {}",
            self.valid_up_to
        ))
    }
}
384
385pub struct CodePoints<'a> {
387 bytes: slice::Iter<'a, u8>,
388}
389impl Iterator for CodePoints<'_> {
390 type Item = CodePoint;
391
392 #[inline]
393 fn next(&mut self) -> Option<CodePoint> {
394 const CONT_MASK: u8 = 0b0011_1111;
398
399 #[inline]
403 fn utf8_first_byte(byte: u8, width: u32) -> u32 {
404 (byte & (0x7F >> width)) as u32
405 }
406
407 #[inline]
409 fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
410 (ch << 6) | (byte & CONT_MASK) as u32
411 }
412
413 #[inline]
414 fn unwrap_or_0(opt: Option<&u8>) -> u8 {
415 match opt {
416 Some(&byte) => byte,
417 None => 0,
418 }
419 }
420
421 let x = *self.bytes.next()?;
422 if x < 128 {
423 return Some(unsafe { CodePoint::from_u32_unchecked(x as u32) });
425 }
426
427 let init = utf8_first_byte(x, 2);
431 let y = unwrap_or_0(self.bytes.next());
432 let mut ch = utf8_acc_cont_byte(init, y);
433 if x >= 0xE0 {
434 let z = unwrap_or_0(self.bytes.next());
437 let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
438 ch = init << 12 | y_z;
439 if x >= 0xF0 {
440 let w = unwrap_or_0(self.bytes.next());
443 ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w);
444 }
445 }
446
447 Some(unsafe { CodePoint::from_u32_unchecked(ch) })
449 }
450
451 #[inline]
452 fn size_hint(&self) -> (usize, Option<usize>) {
453 let v = self.bytes.len();
454 (v.saturating_add(3) / 4, Some(v))
455 }
456}
457impl FusedIterator for CodePoints<'_> {}
458
459pub struct EncodeUtf16<'a>(codepoint::EncodeUtf16<CodePoints<'a>>);
461impl Iterator for EncodeUtf16<'_> {
462 type Item = u16;
463
464 #[inline]
465 fn next(&mut self) -> Option<u16> {
466 self.0.next()
467 }
468
469 #[inline]
470 fn size_hint(&self) -> (usize, Option<usize>) {
471 self.0.size_hint()
472 }
473}
474impl FusedIterator for EncodeUtf16<'_> {}
475
/// One piece of a [`Wtf8`] slice as produced by the [`Chunks`] iterator.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
pub enum Wtf8Chunk<'a> {
    /// A run of valid UTF-8 text.
    Utf8(&'a str),

    /// A single unpaired surrogate, given as its 16-bit value.
    UnpairedSurrogate(u16),
}

impl<'a> Wtf8Chunk<'a> {
    /// Returns the text of a [`Wtf8Chunk::Utf8`] chunk, or `None` for an
    /// unpaired surrogate.
    #[inline]
    pub fn utf8(self) -> Option<&'a str> {
        match self {
            Self::Utf8(text) => Some(text),
            Self::UnpairedSurrogate(_) => None,
        }
    }
}
498
/// Iterator that splits a [`Wtf8`] slice into UTF-8 runs and unpaired
/// surrogates, created by `Wtf8::chunks`. It holds the bytes that have not
/// been yielded yet.
pub struct Chunks<'a>(&'a [u8]);

impl Chunks<'_> {
    /// Returns the byte offset of the first encoded surrogate in the
    /// remaining input, or `None` when there is none.
    ///
    /// The scan hops from lead byte to lead byte using the sequence length
    /// implied by each lead. A surrogate is a three-byte sequence starting
    /// with `0xED` whose second byte is `0xA0` or above.
    #[inline]
    pub(crate) fn next_surrogate(&self) -> Option<usize> {
        let bytes = self.0;
        let mut pos = 0;

        while let Some(&lead) = bytes.get(pos) {
            let width = match lead {
                0x00..=0x7F => 1,
                0x80..=0xDF => 2,
                0xED => {
                    // Only a complete three-byte sequence counts as a surrogate.
                    if let (Some(&second), Some(_)) = (bytes.get(pos + 1), bytes.get(pos + 2)) {
                        if second >= 0xA0 {
                            return Some(pos);
                        }
                    }
                    3
                }
                0xE0..=0xEF => 3,
                0xF0..=0xFF => 4,
            };
            pos += width;
        }

        None
    }
}
534impl<'a> Iterator for Chunks<'a> {
535 type Item = Wtf8Chunk<'a>;
536
537 #[inline]
538 fn next(&mut self) -> Option<Wtf8Chunk<'a>> {
539 match self.next_surrogate() {
540 Some(0) => {
541 let s = decode_surrogate(self.0[1], self.0[2]);
542 self.0 = &self.0[3..];
543 Some(Wtf8Chunk::UnpairedSurrogate(s))
544 }
545
546 Some(x) => {
547 let r = &self.0[..x];
548 self.0 = &self.0[x..];
549 Some(Wtf8Chunk::Utf8(unsafe { str::from_utf8_unchecked(r) }))
551 }
552
553 None if self.0.is_empty() => None,
554
555 None => {
556 let r = self.0;
557 self.0 = &[];
558 Some(Wtf8Chunk::Utf8(unsafe { str::from_utf8_unchecked(r) }))
560 }
561 }
562 }
563}