1use crate::{decode_surrogate, decode_surrogate_pair, CodePoint, Wtf8};
4use alloc::borrow::ToOwned;
5use alloc::boxed::Box;
6use alloc::string::String;
7use alloc::vec::Vec;
8use core::borrow::{Borrow, BorrowMut};
9use core::convert::Infallible;
10use core::iter::FromIterator;
11use core::ops::{Deref, DerefMut};
12use core::str::FromStr;
13use core::{char, fmt};
14
// Unit tests for this module; the declaration is only compiled when `cfg(test)` is set.
#[cfg(test)]
mod tests;
17
/// An owned, growable buffer whose borrowed form is [`Wtf8`].
///
/// It is the `Owned` type of `Wtf8`'s `ToOwned` impl and dereferences to
/// `Wtf8`. Equality and ordering are derived, so they compare the underlying
/// bytes.
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct Wtf8Buf {
    /// Encoded contents. The methods in this file assume these bytes are
    /// well-formed WTF-8 when reinterpreting them as `Wtf8`.
    bytes: Vec<u8>,
}
23
24impl Wtf8Buf {
25 #[inline]
26 pub(crate) fn from_bytes(x: Vec<u8>) -> Wtf8Buf {
27 Self { bytes: x }
28 }
29
30 #[inline]
32 pub const fn new() -> Wtf8Buf {
33 Wtf8Buf { bytes: Vec::new() }
34 }
35
36 #[inline]
38 pub fn with_capacity(capacity: usize) -> Wtf8Buf {
39 Wtf8Buf {
40 bytes: Vec::with_capacity(capacity),
41 }
42 }
43
44 #[inline]
50 pub fn from_string(string: String) -> Wtf8Buf {
51 Wtf8Buf {
52 bytes: string.into_bytes(),
53 }
54 }
55
56 #[inline]
64 pub fn reserve(&mut self, additional: usize) {
65 self.bytes.reserve(additional)
66 }
67
68 #[inline]
81 pub fn reserve_exact(&mut self, additional: usize) {
82 self.bytes.reserve_exact(additional)
83 }
84
85 #[inline]
90 pub fn shrink_to_fit(&mut self) {
91 self.bytes.shrink_to_fit()
92 }
93
94 #[inline]
96 pub fn capacity(&self) -> usize {
97 self.bytes.capacity()
98 }
99
100 #[inline]
106 #[allow(clippy::should_implement_trait)]
107 pub fn from_str(str: &str) -> Wtf8Buf {
108 Wtf8Buf {
109 bytes: <[_]>::to_vec(str.as_bytes()),
110 }
111 }
112
113 #[inline]
115 pub fn clear(&mut self) {
116 self.bytes.clear()
117 }
118
119 pub fn from_utf16<I>(v: I) -> Wtf8Buf
124 where
125 I: IntoIterator<Item = u16>,
126 {
127 let iter = v.into_iter();
128 let mut string = Wtf8Buf::with_capacity(iter.size_hint().0);
129 for item in char::decode_utf16(iter) {
130 match item {
131 Ok(ch) => string.push_char(ch),
132 Err(surrogate) => {
133 let surrogate = surrogate.unpaired_surrogate();
134 let code_point = unsafe { CodePoint::from_u32_unchecked(surrogate as u32) };
136 string.push_code_point_unchecked(code_point)
139 }
140 }
141 }
142 string
143 }
144
145 #[inline]
147 pub fn as_wtf8(&self) -> &Wtf8 {
148 unsafe { &*(self.bytes.as_slice() as *const [u8] as *const Wtf8) }
150 }
151
152 #[inline]
154 pub fn as_mut_wtf8(&mut self) -> &mut Wtf8 {
155 unsafe { &mut *(self.bytes.as_mut_slice() as *mut [u8] as *mut Wtf8) }
157 }
158
159 #[inline]
161 pub fn push_str(&mut self, other: &str) {
162 self.bytes.extend_from_slice(other.as_bytes())
163 }
164
165 #[inline]
171 pub fn push_wtf8(&mut self, other: &Wtf8) {
172 match (
173 (&*self).final_lead_surrogate(),
174 other.initial_trail_surrogate(),
175 ) {
176 (Some(lead), Some(trail)) => {
178 let len_without_lead_surrogate = self.len() - 3;
179 self.bytes.truncate(len_without_lead_surrogate);
180 let other_without_trail_surrogate = &other.bytes()[3..];
181 self.bytes.reserve(4 + other_without_trail_surrogate.len());
183 self.push_char(decode_surrogate_pair(lead, trail));
184 self.bytes.extend_from_slice(other_without_trail_surrogate);
185 }
186 _ => self.bytes.extend_from_slice(other.bytes()),
187 }
188 }
189
190 #[inline]
192 pub fn push_char(&mut self, c: char) {
193 self.push_code_point_unchecked(CodePoint::from_char(c))
194 }
195
196 #[inline]
202 pub fn push(&mut self, code_point: CodePoint) {
203 if let trail @ 0xDC00..=0xDFFF = code_point.to_u32() {
204 if let Some(lead) = self.final_lead_surrogate() {
205 let len_without_lead_surrogate = self.len() - 3;
206 self.bytes.truncate(len_without_lead_surrogate);
207 self.push_char(decode_surrogate_pair(lead, trail as u16));
208 return;
209 }
210 }
211
212 self.push_code_point_unchecked(code_point)
214 }
215
216 #[inline]
223 pub fn truncate(&mut self, new_len: usize) {
224 assert!(self.is_code_point_boundary(new_len));
225 self.bytes.truncate(new_len)
226 }
227
228 pub fn into_string(self) -> Result<String, IntoStringError> {
236 let chunks = self.chunks();
237
238 match chunks.next_surrogate() {
239 Some(position) => Err(IntoStringError {
240 wtf8: self,
241 valid_up_to: position,
242 }),
243 None => unsafe { Ok(String::from_utf8_unchecked(self.bytes)) },
245 }
246 }
247
248 pub fn into_string_lossy(self) -> String {
254 let chunks = self.chunks();
255
256 if chunks.next_surrogate().is_none() {
257 unsafe { String::from_utf8_unchecked(self.bytes) }
259 } else {
260 self.to_string_lossy().into_owned()
261 }
262 }
263
264 #[inline]
266 pub fn into_box(self) -> Box<Wtf8> {
267 unsafe { Box::from_raw(Box::into_raw(self.bytes.into_boxed_slice()) as *mut Wtf8) }
269 }
270
271 pub fn from_box(boxed: Box<Wtf8>) -> Wtf8Buf {
273 let bytes: Box<[u8]> = unsafe { Box::from_raw(Box::into_raw(boxed) as *mut [u8]) };
275 Wtf8Buf {
276 bytes: bytes.into_vec(),
277 }
278 }
279
280 #[inline]
281 fn push_code_point_unchecked(&mut self, code_point: CodePoint) {
282 const TAG_CONT: u8 = 0b1000_0000;
283 const TAG_TWO_B: u8 = 0b1100_0000;
284 const TAG_THREE_B: u8 = 0b1110_0000;
285 const TAG_FOUR_B: u8 = 0b1111_0000;
286 const MAX_ONE_B: u32 = 0x80;
287 const MAX_TWO_B: u32 = 0x800;
288 const MAX_THREE_B: u32 = 0x10000;
289
290 #[inline]
291 const fn len_utf8(code: u32) -> usize {
292 if code < MAX_ONE_B {
293 1
294 } else if code < MAX_TWO_B {
295 2
296 } else if code < MAX_THREE_B {
297 3
298 } else {
299 4
300 }
301 }
302
303 #[inline]
304 fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> &mut [u8] {
305 let len = len_utf8(code);
306 #[allow(clippy::redundant_slicing)]
307 match (len, &mut dst[..]) {
308 (1, [a, ..]) => {
309 *a = code as u8;
310 }
311 (2, [a, b, ..]) => {
312 *a = (code >> 6 & 0x1F) as u8 | TAG_TWO_B;
313 *b = (code & 0x3F) as u8 | TAG_CONT;
314 }
315 (3, [a, b, c, ..]) => {
316 *a = (code >> 12 & 0x0F) as u8 | TAG_THREE_B;
317 *b = (code >> 6 & 0x3F) as u8 | TAG_CONT;
318 *c = (code & 0x3F) as u8 | TAG_CONT;
319 }
320 (4, [a, b, c, d, ..]) => {
321 *a = (code >> 18 & 0x07) as u8 | TAG_FOUR_B;
322 *b = (code >> 12 & 0x3F) as u8 | TAG_CONT;
323 *c = (code >> 6 & 0x3F) as u8 | TAG_CONT;
324 *d = (code & 0x3F) as u8 | TAG_CONT;
325 }
326 _ => panic!(
327 "encode_utf8: need {} bytes to encode U+{:X}, but the buffer has {}",
328 len,
329 code,
330 dst.len(),
331 ),
332 };
333 &mut dst[..len]
334 }
335
336 let mut bytes = [0; 4];
337 let bytes = encode_utf8_raw(code_point.to_u32(), &mut bytes);
338 self.bytes.extend_from_slice(bytes)
339 }
340
341 #[inline]
342 fn final_lead_surrogate(&self) -> Option<u16> {
343 match self.bytes() {
344 [.., 0xED, b2 @ 0xA0..=0xAF, b3] => Some(decode_surrogate(*b2, *b3)),
345 _ => None,
346 }
347 }
348}
349
350impl Deref for Wtf8Buf {
351 type Target = Wtf8;
352 #[inline]
353 fn deref(&self) -> &Wtf8 {
354 self.as_wtf8()
355 }
356}
357
358impl DerefMut for Wtf8Buf {
359 #[inline]
360 fn deref_mut(&mut self) -> &mut Wtf8 {
361 self.as_mut_wtf8()
362 }
363}
364
365impl From<String> for Wtf8Buf {
366 #[inline]
367 fn from(x: String) -> Wtf8Buf {
368 Wtf8Buf::from_string(x)
369 }
370}
371impl From<&str> for Wtf8Buf {
372 #[inline]
373 fn from(x: &str) -> Wtf8Buf {
374 Wtf8Buf::from_str(x)
375 }
376}
377impl From<&Wtf8> for Wtf8Buf {
378 #[inline]
379 fn from(x: &Wtf8) -> Wtf8Buf {
380 x.to_owned()
381 }
382}
383
384impl AsRef<Wtf8> for Wtf8Buf {
385 #[inline]
386 fn as_ref(&self) -> &Wtf8 {
387 self
388 }
389}
390impl Borrow<Wtf8> for Wtf8Buf {
391 #[inline]
392 fn borrow(&self) -> &Wtf8 {
393 self
394 }
395}
396impl AsMut<Wtf8> for Wtf8Buf {
397 #[inline]
398 fn as_mut(&mut self) -> &mut Wtf8 {
399 self
400 }
401}
402impl BorrowMut<Wtf8> for Wtf8Buf {
403 #[inline]
404 fn borrow_mut(&mut self) -> &mut Wtf8 {
405 self
406 }
407}
408
409impl FromStr for Wtf8Buf {
410 type Err = Infallible;
411
412 #[inline]
413 fn from_str(s: &str) -> Result<Self, Infallible> {
414 Ok(Wtf8Buf::from_str(s))
415 }
416}
417
418impl ToOwned for Wtf8 {
419 type Owned = Wtf8Buf;
420
421 #[inline]
422 fn to_owned(&self) -> Wtf8Buf {
423 Wtf8Buf {
424 bytes: self.bytes().to_owned(),
425 }
426 }
427}
428
429impl FromIterator<CodePoint> for Wtf8Buf {
434 fn from_iter<T: IntoIterator<Item = CodePoint>>(iter: T) -> Wtf8Buf {
435 let mut string = Wtf8Buf::new();
436 string.extend(iter);
437 string
438 }
439}
440
441impl FromIterator<char> for Wtf8Buf {
442 fn from_iter<T: IntoIterator<Item = char>>(iter: T) -> Wtf8Buf {
443 let mut string = Wtf8Buf::new();
444 string.extend(iter);
445 string
446 }
447}
448
449impl<'a> FromIterator<&'a Wtf8> for Wtf8Buf {
450 fn from_iter<T: IntoIterator<Item = &'a Wtf8>>(iter: T) -> Wtf8Buf {
451 let mut string = Wtf8Buf::new();
452 string.extend(iter);
453 string
454 }
455}
456
457impl<'a> FromIterator<&'a str> for Wtf8Buf {
458 fn from_iter<T: IntoIterator<Item = &'a str>>(iter: T) -> Wtf8Buf {
459 let mut string = Wtf8Buf::new();
460 string.extend(iter);
461 string
462 }
463}
464
465impl<'a> FromIterator<&'a CodePoint> for Wtf8Buf {
466 fn from_iter<T: IntoIterator<Item = &'a CodePoint>>(iter: T) -> Wtf8Buf {
467 let mut string = Wtf8Buf::new();
468 string.extend(iter);
469 string
470 }
471}
472
473impl<'a> FromIterator<&'a char> for Wtf8Buf {
474 fn from_iter<T: IntoIterator<Item = &'a char>>(iter: T) -> Wtf8Buf {
475 let mut string = Wtf8Buf::new();
476 string.extend(iter);
477 string
478 }
479}
480
481impl Extend<CodePoint> for Wtf8Buf {
486 fn extend<T: IntoIterator<Item = CodePoint>>(&mut self, iter: T) {
487 let iterator = iter.into_iter();
488 let (low, _high) = iterator.size_hint();
489 self.bytes.reserve(low);
491 for code_point in iterator {
492 self.push(code_point);
493 }
494 }
495}
496
497impl Extend<char> for Wtf8Buf {
498 fn extend<T: IntoIterator<Item = char>>(&mut self, iter: T) {
499 let iterator = iter.into_iter();
500 let (low, _high) = iterator.size_hint();
501 self.bytes.reserve(low);
502 for c in iterator {
503 self.push_char(c);
504 }
505 }
506}
507
508impl<'a> Extend<&'a str> for Wtf8Buf {
509 fn extend<T: IntoIterator<Item = &'a str>>(&mut self, iter: T) {
510 let iterator = iter.into_iter();
511 let (low, _high) = iterator.size_hint();
512 self.bytes.reserve(low);
513 for c in iterator {
514 self.push_str(c);
515 }
516 }
517}
518
519impl<'a> Extend<&'a Wtf8> for Wtf8Buf {
520 fn extend<T: IntoIterator<Item = &'a Wtf8>>(&mut self, iter: T) {
521 let iterator = iter.into_iter();
522 let (low, _high) = iterator.size_hint();
523 self.bytes.reserve(low);
524 for c in iterator {
525 self.push_wtf8(c);
526 }
527 }
528}
529
530impl<'a> Extend<&'a CodePoint> for Wtf8Buf {
531 #[inline]
532 fn extend<T: IntoIterator<Item = &'a CodePoint>>(&mut self, iter: T) {
533 self.extend(iter.into_iter().copied())
534 }
535}
536
537impl<'a> Extend<&'a char> for Wtf8Buf {
538 #[inline]
539 fn extend<T: IntoIterator<Item = &'a char>>(&mut self, iter: T) {
540 self.extend(iter.into_iter().copied())
541 }
542}
543
544impl fmt::Debug for Wtf8Buf {
545 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
546 fmt::Debug::fmt(self.as_wtf8(), f)
547 }
548}
549
550impl fmt::Display for Wtf8Buf {
551 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
552 fmt::Display::fmt(self.as_wtf8(), f)
553 }
554}
555
/// Error returned by `Wtf8Buf::into_string` when the buffer contains a
/// surrogate and so cannot become a `String`.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
pub struct IntoStringError {
    /// The buffer that failed to convert, handed back unchanged.
    pub wtf8: Wtf8Buf,
    /// Position of the surrogate reported by the scan in `into_string`.
    valid_up_to: usize,
}
562impl IntoStringError {
563 #[inline]
569 pub fn valid_up_to(&self) -> usize {
570 self.valid_up_to
571 }
572
573 #[inline]
581 pub fn error_len(&self) -> usize {
582 3
583 }
584}
585impl fmt::Display for IntoStringError {
586 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
587 write!(
588 f,
589 "invalid utf-8 sequence of 3 bytes from index {}",
590 self.valid_up_to
591 )
592 }
593}