1use crate::encoding::sealed::Sealed;
2use crate::encoding::{Encoding, NullTerminable, ValidateError};
3use crate::str::Str;
4use arrayvec::ArrayVec;
5#[cfg(feature = "rand")]
6use rand::{distr::Distribution, Rng};
7
/// Marker type selecting the UTF-8 encoding.
///
/// The type holds no data; it exists to carry the `Encoding` implementation.
#[non_exhaustive]
#[derive(Default)]
pub struct Utf8;

// `Sealed` comes from `crate::encoding::sealed`.
// NOTE(review): presumably the usual sealing marker that restricts which types
// may implement `Encoding` — confirm against the trait definition.
impl Sealed for Utf8 {}
14
15impl Encoding for Utf8 {
16 const REPLACEMENT: char = '\u{FFFD}';
17 const MAX_LEN: usize = 4;
18 type Bytes = ArrayVec<u8, 4>;
19
20 fn shorthand() -> &'static str {
21 "utf8"
22 }
23
24 fn validate(bytes: &[u8]) -> Result<(), ValidateError> {
25 core::str::from_utf8(bytes)
26 .map(|_| ())
27 .map_err(|e| ValidateError {
28 valid_up_to: e.valid_up_to(),
29 error_len: e.error_len().map(|e| e as u8),
30 })
31 }
32
33 fn encode_char(c: char) -> Option<Self::Bytes> {
34 let mut out = [0; 4];
35 let res = c.encode_utf8(&mut out);
36 let mut out = ArrayVec::new();
37 out.extend(res.as_bytes().iter().copied());
38 Some(out)
39 }
40
41 fn decode_char(str: &Str<Self>) -> (char, &Str<Self>) {
42 let c = str.as_std().chars().next().unwrap();
43 (c, &str[c.len_utf8()..])
44 }
45
46 fn char_bound(str: &Str<Self>, idx: usize) -> bool {
47 str.as_std().is_char_boundary(idx)
48 }
49
50 fn char_len(c: char) -> usize {
51 c.len_utf8()
52 }
53}
54
// Opts `Utf8` into `NullTerminable`; no items are overridden here.
// NOTE(review): presumably valid because the only UTF-8 sequence containing a
// zero byte is U+0000 itself — confirm against the trait's documented contract.
impl NullTerminable for Utf8 {}
56
// Only built with the `rand` feature: sampling from `Utf8` produces a random `char`.
#[cfg(feature = "rand")]
impl Distribution<char> for Utf8 {
    fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> char {
        // The declared return type fixes the sampled type to `char`.
        rng.random()
    }
}
63
/// UTF-16 without an explicit byte order; an alias for [`Utf16LE`].
// NOTE(review): unlike `Utf32`, this alias does not follow `target_endian` —
// confirm that always-little-endian is the intended default.
pub type Utf16 = Utf16LE;
66
/// Classification of a single 16-bit UTF-16 code unit.
#[derive(PartialEq, Eq)]
enum Kind {
    /// Any unit outside the surrogate range `0xD800..=0xDFFF`.
    Char,
    /// A high (leading) surrogate, `0xD800..=0xDBFF`.
    High,
    /// A low (trailing) surrogate, `0xDC00..=0xDFFF`.
    Low,
}

impl Kind {
    /// Classifies the code unit `c`.
    fn of(c: u16) -> Kind {
        match c {
            0xD800..=0xDBFF => Kind::High,
            0xDC00..=0xDFFF => Kind::Low,
            // Everything below 0xD800 or from 0xE000 upwards stands on its own.
            _ => Kind::Char,
        }
    }
}
84
// Generates a UTF-16 encoding marker type for one byte order.
//
// Arguments, in order:
// - `$name`: identifier of the generated unit struct
// - `$shorthand`: string returned by `Encoding::shorthand`
// - `$method_from`: `u16` constructor from a 2-byte array (`from_be_bytes` / `from_le_bytes`)
// - `$method_to`: `u16` conversion to a 2-byte array (`to_be_bytes` / `to_le_bytes`)
// - `$idx_add`: offset, within a 2-byte unit, of the high-order byte (the byte that
//   identifies a surrogate)
// - `$docname`: display name used as the link text in the generated doc comment
macro_rules! utf16_impl {
    (
        $name:ident,
        $shorthand:literal,
        $method_from:ident,
        $method_to:ident,
        $idx_add:literal,
        $docname:literal,
    ) => {
        #[doc = "The ["]
        #[doc = $docname]
        #[doc = "](https://en.wikipedia.org/wiki/UTF-16#Byte-order_encoding_schemes) encoding"]
        #[non_exhaustive]
        #[derive(Default)]
        pub struct $name;

        impl Sealed for $name {}

        impl Encoding for $name {
            // U+FFFD REPLACEMENT CHARACTER.
            const REPLACEMENT: char = '\u{FFFD}';
            // A surrogate pair is two 16-bit units, i.e. four bytes.
            const MAX_LEN: usize = 4;
            type Bytes = ArrayVec<u8, 4>;

            fn shorthand() -> &'static str {
                $shorthand
            }

            // Validates `bytes` as UTF-16 in this byte order.
            //
            // Errors, in priority order:
            // - an unpaired surrogate in the middle of the input: `valid_up_to` is the
            //   byte offset of the offending unit, `error_len` is `Some(2)`, or
            //   `Some(4)` when a high surrogate is followed by another high surrogate;
            // - a high surrogate as the last whole unit: `valid_up_to` is its offset,
            //   `error_len` is `None` (input ended early);
            // - a single trailing byte: `valid_up_to` is its offset, `error_len` is `None`.
            fn validate(bytes: &[u8]) -> Result<(), ValidateError> {
                let chunks = bytes.chunks_exact(2);
                // Byte length of the prefix made of whole 16-bit units. It differs from
                // `bytes.len()` only when one stray byte trails the input.
                let whole_len = bytes.len() - chunks.remainder().len();

                // True while the previous unit was a high surrogate awaiting its low half.
                let mut surrogate = false;
                for (idx, chunk) in chunks.enumerate() {
                    let unit = u16::$method_from([chunk[0], chunk[1]]);

                    match (surrogate, Kind::of(unit)) {
                        (false, Kind::Char) => {}
                        (false, Kind::High) => surrogate = true,
                        (true, Kind::Low) => surrogate = false,
                        // A low surrogate with no preceding high surrogate.
                        (false, Kind::Low) => {
                            return Err(ValidateError {
                                valid_up_to: idx * 2,
                                error_len: Some(2),
                            });
                        }
                        // A pending high surrogate followed by something other than a
                        // low surrogate. The error starts at the pending unit, which is
                        // the previous one, so `idx >= 1` here.
                        (true, kind) => {
                            let err_len = if kind == Kind::High { 4 } else { 2 };
                            return Err(ValidateError {
                                valid_up_to: (idx - 1) * 2,
                                error_len: Some(err_len),
                            });
                        }
                    }
                }

                if surrogate {
                    // The last whole unit is an unfinished high surrogate. Measure from
                    // the end of the whole units, not from `bytes.len()`, so that a stray
                    // trailing byte cannot shift the offset into the middle of a unit.
                    return Err(ValidateError {
                        valid_up_to: whole_len - 2,
                        error_len: None,
                    });
                }

                if whole_len != bytes.len() {
                    // Half a code unit is left over at the end.
                    return Err(ValidateError {
                        valid_up_to: whole_len,
                        error_len: None,
                    });
                }

                Ok(())
            }

            // Encodes `c` as one or two 16-bit units in this byte order. Always `Some`.
            fn encode_char(c: char) -> Option<Self::Bytes> {
                let mut units = [0; 2];
                let encoded = c.encode_utf16(&mut units);
                let mut out = ArrayVec::new();
                out.extend(encoded[0].$method_to());
                if encoded.len() > 1 {
                    out.extend(encoded[1].$method_to());
                }
                Some(out)
            }

            // Splits the first character off `str`, returning it with the remainder.
            //
            // Panics if `str` is empty (the first unit is read by index).
            fn decode_char(str: &Str<Self>) -> (char, &Str<Self>) {
                let bytes = str.as_bytes();
                let high = u16::$method_from([bytes[0], bytes[1]]);
                if (..0xD800).contains(&high) || (0xE000..).contains(&high) {
                    // SAFETY: `high` is outside the surrogate range and fits in 16 bits,
                    // so it is a valid Unicode scalar value.
                    let c = unsafe { char::from_u32_unchecked(high as u32) };
                    (c, &str[2..])
                } else {
                    let low = u16::$method_from([bytes[2], bytes[3]]);

                    let high = (high as u32 - 0xD800) * 0x400;
                    let low = low as u32 - 0xDC00;
                    // SAFETY: relies on `str` holding well-formed UTF-16 (a `Str` is built
                    // either through validation or through an `unsafe` unchecked
                    // constructor), so `high`/`low` form a proper surrogate pair and the
                    // sum lies in 0x10000..=0x10FFFF.
                    let c = unsafe { char::from_u32_unchecked(high + low + 0x10000) };
                    (c, &str[4..])
                }
            }

            // Reports whether byte offset `idx` lies on a character boundary.
            //
            // A boundary is an even offset whose unit is not a low surrogate. Only the
            // high-order byte of the unit (at `idx + $idx_add`) needs inspecting. The end
            // of the string counts as a boundary and offsets past the end do not,
            // mirroring `str::is_char_boundary`, instead of panicking on the index.
            fn char_bound(str: &Str<Self>, idx: usize) -> bool {
                if idx % 2 != 0 {
                    return false;
                }
                let bytes = str.as_bytes();
                match bytes.get(idx + $idx_add) {
                    Some(high_byte) => !(0xDC..0xE0).contains(high_byte),
                    None => idx == bytes.len(),
                }
            }

            // Number of bytes `c` occupies as reported by `char::len_utf16`.
            // NOTE(review): `len_utf16` counts 16-bit units (1 or 2), while the encoded
            // form is 2 or 4 bytes — confirm which unit `Encoding::char_len` expects.
            fn char_len(c: char) -> usize {
                c.len_utf16()
            }
        }

        #[cfg(feature = "rand")]
        impl Distribution<char> for $name {
            fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> char {
                rng.random::<char>()
            }
        }
    };
}
209
// Big-endian UTF-16. The high-order byte of each unit comes first, so the
// generated `char_bound` inspects the byte at offset 0 of the unit.
utf16_impl!(
    Utf16BE,
    "utf16be",
    from_be_bytes,
    to_be_bytes,
    0,
    "UTF-16BE",
);
218
// Little-endian UTF-16. The high-order byte of each unit comes second, so the
// generated `char_bound` inspects the byte at offset 1 of the unit.
utf16_impl!(
    Utf16LE,
    "utf16le",
    from_le_bytes,
    to_le_bytes,
    1,
    "UTF-16LE",
);
227
// Generates a UTF-32 encoding marker type for one byte order.
//
// Arguments, in order:
// - `$name`: identifier of the generated unit struct
// - `$shorthand`: string returned by `Encoding::shorthand`
// - `$method_from`: `u32` constructor from a 4-byte array (`from_be_bytes` / `from_le_bytes`)
// - `$method_to`: `u32` conversion to a 4-byte array (`to_be_bytes` / `to_le_bytes`)
// - `$docname`: display name used as the link text in the generated doc comment
macro_rules! utf32_impl {
    (
        $name:ident,
        $shorthand:literal,
        $method_from:ident,
        $method_to:ident,
        $docname:literal,
    ) => {
        #[doc = "The ["]
        #[doc = $docname]
        #[doc = "](https://en.wikipedia.org/wiki/UTF-32) encoding"]
        #[non_exhaustive]
        #[derive(Default)]
        pub struct $name;

        impl Sealed for $name {}

        impl Encoding for $name {
            // U+FFFD REPLACEMENT CHARACTER.
            const REPLACEMENT: char = '\u{FFFD}';
            // Every character is exactly one 32-bit unit.
            const MAX_LEN: usize = 4;
            type Bytes = [u8; 4];

            fn shorthand() -> &'static str {
                $shorthand
            }

            // Validates `bytes` as UTF-32 in this byte order.
            //
            // The first unit that is not a Unicode scalar value (a surrogate, or a
            // value above U+10FFFF) yields an error with `error_len: Some(4)`. If all
            // whole units are valid but 1..=3 bytes are left over, the error points at
            // the leftover bytes with `error_len: None`.
            fn validate(bytes: &[u8]) -> Result<(), ValidateError> {
                let units = bytes.chunks_exact(4);
                let trailing = units.remainder().len();

                for (idx, unit) in units.enumerate() {
                    let value = u32::$method_from([unit[0], unit[1], unit[2], unit[3]]);
                    // `char::from_u32` rejects exactly the surrogates and values >= 0x110000.
                    if char::from_u32(value).is_none() {
                        return Err(ValidateError {
                            valid_up_to: idx * 4,
                            error_len: Some(4),
                        });
                    }
                }

                if trailing != 0 {
                    return Err(ValidateError {
                        valid_up_to: bytes.len() - trailing,
                        error_len: None,
                    });
                }

                Ok(())
            }

            // Encodes `c` as its scalar value in this byte order. Always `Some`.
            fn encode_char(c: char) -> Option<Self::Bytes> {
                let scalar = u32::from(c);
                Some(scalar.$method_to())
            }

            // Splits the first character off `str`, returning it with the remainder.
            //
            // Panics if `str` holds fewer than four bytes (the unit is read by index).
            fn decode_char(str: &Str<Self>) -> (char, &Str<Self>) {
                let raw = str.as_bytes();
                let scalar = u32::$method_from([raw[0], raw[1], raw[2], raw[3]]);
                // SAFETY: relies on `str` holding well-formed UTF-32 (a `Str` is built
                // either through validation or through an `unsafe` unchecked
                // constructor), so `scalar` is a valid Unicode scalar value.
                let decoded = unsafe { char::from_u32_unchecked(scalar) };
                (decoded, &str[4..])
            }

            // Every multiple of four is a boundary; the string contents are not consulted.
            fn char_bound(_: &Str<Self>, idx: usize) -> bool {
                idx % 4 == 0
            }

            // Fixed width: four bytes regardless of the character.
            fn char_len(_: char) -> usize {
                4
            }
        }

        #[cfg(feature = "rand")]
        impl Distribution<char> for $name {
            fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> char {
                rng.random()
            }
        }
    };
}
304
// Big-endian UTF-32: units are read and written with `u32::{from,to}_be_bytes`.
utf32_impl!(Utf32BE, "utf32be", from_be_bytes, to_be_bytes, "UTF-32BE",);
// Little-endian UTF-32: units are read and written with `u32::{from,to}_le_bytes`.
utf32_impl!(Utf32LE, "utf32le", from_le_bytes, to_le_bytes, "UTF-32LE",);
307
/// UTF-32 in the byte order of the compilation target (little-endian here).
#[cfg(target_endian = "little")]
pub type Utf32 = Utf32LE;

/// UTF-32 in the byte order of the compilation target (big-endian here).
#[cfg(target_endian = "big")]
pub type Utf32 = Utf32BE;
315
// Unit tests for the UTF-16 and UTF-32 encodings in both byte orders.
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec::Vec;

    extern crate alloc;

    // The byte strings below deliberately contain `\0` followed by a digit
    // (a NUL byte and then an ASCII digit), which clippy reads as an octal escape.
    #[allow(clippy::octal_escapes)]
    #[test]
    fn test_validate_utf16_le() {
        // Plain BMP text, then text ending in a complete surrogate pair.
        assert!(Utf16LE::validate(b"a\0b\0c\01\02\03\0").is_ok());
        assert!(Utf16LE::validate(b"A\0 \0y\0e\0e\0:\0 \0\x01\xD8\x37\xDC").is_ok());
        // High surrogate followed by an ordinary character.
        assert_eq!(
            Utf16LE::validate(b"\x01\xD8a\0"),
            Err(ValidateError {
                valid_up_to: 0,
                error_len: Some(2),
            })
        );
        // High surrogate followed by another high surrogate.
        assert_eq!(
            Utf16LE::validate(b" \0\x01\xD8\x01\xD8"),
            Err(ValidateError {
                valid_up_to: 2,
                error_len: Some(4),
            })
        );
        // Input consisting only of an unfinished high surrogate.
        assert_eq!(
            Utf16LE::validate(b"\x01\xD8"),
            Err(ValidateError {
                valid_up_to: 0,
                error_len: None,
            })
        );
        // Valid text that ends in an unfinished high surrogate.
        assert_eq!(
            Utf16LE::validate(b"a\0b\0\x01\xD8"),
            Err(ValidateError {
                valid_up_to: 4,
                error_len: None,
            })
        );
    }

    #[test]
    fn test_encode_utf16_le() {
        // One-unit character: only two of the four slots are filled.
        let mut expect = ArrayVec::new();
        expect.extend([b'A', 0]);
        assert_eq!(Utf16LE::encode_char('A'), Some(expect));
        // U+10437 needs a surrogate pair (0xD801, 0xDC37).
        assert_eq!(
            Utf16LE::encode_char('𐐷'),
            Some(ArrayVec::from([0x01, 0xD8, 0x37, 0xDC]))
        );
    }

    #[test]
    fn test_decode_utf16_le() {
        // SAFETY: the literal is well-formed UTF-16LE: 'A', the pair for U+10437, 'b'.
        let str = unsafe { Str::from_bytes_unchecked(b"A\0\x01\xD8\x37\xDCb\0") };
        let (c, str) = Utf16LE::decode_char(str);
        assert_eq!(c, 'A');
        let (c, str) = Utf16LE::decode_char(str);
        assert_eq!(c, '𐐷');
        let (c, _) = Utf16LE::decode_char(str);
        assert_eq!(c, 'b');
    }

    #[test]
    fn test_char_boundary_utf16le() {
        // SAFETY: the literal is well-formed UTF-16LE: 'A', the pair for U+10437, 'b'.
        let str = unsafe { Str::from_bytes_unchecked(b"A\0\x01\xD8\x37\xDCb\0") };
        // Offset 4 falls between the two halves of the surrogate pair.
        assert!(Utf16LE::char_bound(str, 2));
        assert!(!Utf16LE::char_bound(str, 4));
        assert!(Utf16LE::char_bound(str, 6));

        // The unit at offset 2 has a low-order byte of 223 (0xDF), which is in the
        // 0xDC..0xE0 range but is not the high-order byte, so offset 2 must still
        // count as a boundary. Odd offsets never do.
        let str =
            unsafe { Str::from_bytes_unchecked(&[174, 95, 223, 142, 99, 107, 209, 158, 212, 154]) };
        assert!(!Utf16LE::char_bound(str, 1));
        assert!(Utf16LE::char_bound(str, 2));
        assert!(!Utf16LE::char_bound(str, 3));
        assert!(Utf16LE::char_bound(str, 4));
    }

    // See the note on the little-endian validation test about `\0` + digit.
    #[allow(clippy::octal_escapes)]
    #[test]
    fn test_validate_utf16_be() {
        // Plain BMP text, then text ending in a complete surrogate pair.
        assert!(Utf16BE::validate(b"\0a\0b\0c\01\02\03").is_ok());
        assert!(Utf16BE::validate(b"\0A\0 \0y\0e\0e\0:\0 \xD8\x01\xDC\x37").is_ok());
        // High surrogate followed by an ordinary character.
        assert_eq!(
            Utf16BE::validate(b"\xD8\x01\0a"),
            Err(ValidateError {
                valid_up_to: 0,
                error_len: Some(2),
            })
        );
        // High surrogate followed by another high surrogate.
        assert_eq!(
            Utf16BE::validate(b"\0 \xD8\x01\xD8\x01"),
            Err(ValidateError {
                valid_up_to: 2,
                error_len: Some(4),
            })
        );
        // Input consisting only of an unfinished high surrogate.
        assert_eq!(
            Utf16BE::validate(b"\xD8\x01"),
            Err(ValidateError {
                valid_up_to: 0,
                error_len: None,
            })
        );
        // Valid text that ends in an unfinished high surrogate.
        assert_eq!(
            Utf16BE::validate(b"\0a\0b\xD8\x01"),
            Err(ValidateError {
                valid_up_to: 4,
                error_len: None,
            })
        );
    }

    #[test]
    fn test_encode_utf16_be() {
        // One-unit character: only two of the four slots are filled.
        let mut expect = ArrayVec::new();
        expect.extend([0, b'A']);
        assert_eq!(Utf16BE::encode_char('A'), Some(expect));
        // U+10437 needs a surrogate pair (0xD801, 0xDC37).
        assert_eq!(
            Utf16BE::encode_char('𐐷'),
            Some(ArrayVec::from([0xD8, 0x01, 0xDC, 0x37]))
        );
    }

    #[test]
    fn test_decode_utf16_be() {
        // SAFETY: the literal is well-formed UTF-16BE: 'A', the pair for U+10437, 'b'.
        let str = unsafe { Str::from_bytes_unchecked(b"\0A\xD8\x01\xDC\x37\0b") };
        let (c, str) = Utf16BE::decode_char(str);
        assert_eq!(c, 'A');
        let (c, str) = Utf16BE::decode_char(str);
        assert_eq!(c, '𐐷');
        let (c, _) = Utf16BE::decode_char(str);
        assert_eq!(c, 'b');
    }

    #[test]
    fn test_char_boundary_utf16be() {
        // SAFETY: the literal is well-formed UTF-16BE: 'A', the pair for U+10437, 'b'.
        let str = unsafe { Str::from_bytes_unchecked(b"\0A\xD8\x01\xDC\x37\0b") };
        // Offset 4 falls between the two halves of the surrogate pair.
        assert!(Utf16BE::char_bound(str, 2));
        assert!(!Utf16BE::char_bound(str, 4));
        assert!(Utf16BE::char_bound(str, 6));

        // The unit at offset 2 has a low-order byte of 223 (0xDF), which is in the
        // 0xDC..0xE0 range but is not the high-order byte, so offset 2 must still
        // count as a boundary. Odd offsets never do.
        let str =
            unsafe { Str::from_bytes_unchecked(&[95, 174, 142, 223, 107, 99, 158, 209, 154, 212]) };
        assert!(!Utf16BE::char_bound(str, 1));
        assert!(Utf16BE::char_bound(str, 2));
        assert!(!Utf16BE::char_bound(str, 3));
        assert!(Utf16BE::char_bound(str, 4));
    }

    // Builds the little-endian UTF-32 bytes of a string literal.
    macro_rules! utf32le {
        ($str:literal) => {
            $str.chars()
                .flat_map(|c| (c as u32).to_le_bytes())
                .collect::<Vec<_>>()
        };
    }

    #[test]
    fn test_validate_utf32_le() {
        assert!(Utf32LE::validate(&utf32le!("abc123")).is_ok());
        assert!(Utf32LE::validate(&utf32le!("A yee: 𐐷")).is_ok());
        // The second unit is 0x0000D800, a surrogate code point.
        assert_eq!(
            Utf32LE::validate(&[
                0x61, 0x00, 0x00, 0x00, 0x00, 0xD8, 0x00, 0x00, 0x62, 0x00, 0x00, 0x00,
            ]),
            Err(ValidateError {
                valid_up_to: 4,
                error_len: Some(4),
            })
        );
        // 0x00110000 is the first value above U+10FFFF.
        assert_eq!(
            Utf32LE::validate(&[0x00, 0x00, 0x11, 0x00]),
            Err(ValidateError {
                valid_up_to: 0,
                error_len: Some(4),
            })
        );
    }

    #[test]
    fn test_encode_utf32_le() {
        assert_eq!(Utf32LE::encode_char('A'), Some([b'A', 0, 0, 0]));
        // U+10437, least significant byte first.
        assert_eq!(Utf32LE::encode_char('𐐷'), Some([0x37, 0x04, 0x01, 0x00]));
    }

    #[test]
    fn test_decode_utf32_le() {
        let bytes = utf32le!("A𐐷b");
        let str = Str::from_bytes(&bytes).unwrap();
        let (c, str) = Utf32LE::decode_char(str);
        assert_eq!(c, 'A');
        let (c, str) = Utf32LE::decode_char(str);
        assert_eq!(c, '𐐷');
        let (c, _) = Utf32LE::decode_char(str);
        assert_eq!(c, 'b');
    }

    // Builds the big-endian UTF-32 bytes of a string literal.
    macro_rules! utf32be {
        ($str:literal) => {
            $str.chars()
                .flat_map(|c| (c as u32).to_be_bytes())
                .collect::<Vec<_>>()
        };
    }

    #[test]
    fn test_validate_utf32_be() {
        assert!(Utf32BE::validate(&utf32be!("abc123")).is_ok());
        assert!(Utf32BE::validate(&utf32be!("A yee: 𐐷")).is_ok());
        // The second unit is 0x0000D800, a surrogate code point.
        assert_eq!(
            Utf32BE::validate(&[
                0x00, 0x00, 0x00, 0x61, 0x00, 0x00, 0xD8, 0x00, 0x00, 0x00, 0x00, 0x62,
            ]),
            Err(ValidateError {
                valid_up_to: 4,
                error_len: Some(4),
            })
        );
        // 0x00110000 is the first value above U+10FFFF.
        assert_eq!(
            Utf32BE::validate(&[0x00, 0x11, 0x00, 0x00]),
            Err(ValidateError {
                valid_up_to: 0,
                error_len: Some(4),
            })
        );
    }

    #[test]
    fn test_encode_utf32_be() {
        assert_eq!(Utf32BE::encode_char('A'), Some([0, 0, 0, b'A']));
        // U+10437, most significant byte first.
        assert_eq!(Utf32BE::encode_char('𐐷'), Some([0x00, 0x01, 0x04, 0x37]));
    }

    #[test]
    fn test_decode_utf32_be() {
        let bytes = utf32be!("A𐐷b");
        let str = Str::from_bytes(&bytes).unwrap();
        let (c, str) = Utf32BE::decode_char(str);
        assert_eq!(c, 'A');
        let (c, str) = Utf32BE::decode_char(str);
        assert_eq!(c, '𐐷');
        let (c, _) = Utf32BE::decode_char(str);
        assert_eq!(c, 'b');
    }
}