1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#![no_std]
#[cfg(test)]
extern crate std;
use core::{char::*, iter, str};
/// Iterator adapter that decodes a stream of bytes as UTF-8.
///
/// Built by `decode_utf8`. Each item is either a decoded `char` or an
/// `InvalidSequence` marker for an ill-formed sequence.
///
/// The byte source is wrapped in a `Peekable` so that a byte which does not
/// continue the current sequence can be looked at without being consumed.
#[derive(Clone, Debug)]
pub struct DecodeUtf8<I>(iter::Peekable<I>)
where
    I: Iterator<Item = u8>;
/// Wraps any source of bytes in a `DecodeUtf8` iterator.
///
/// `i` is converted with `IntoIterator::into_iter` and made peekable; no
/// bytes are read until the returned iterator is advanced.
#[inline]
pub fn decode_utf8<I: IntoIterator<Item = u8>>(i: I) -> DecodeUtf8<I::IntoIter> {
    let bytes = i.into_iter();
    DecodeUtf8(bytes.peekable())
}
/// Error item produced by `DecodeUtf8` when the input bytes do not form a
/// valid UTF-8 sequence.
///
/// The private unit field keeps the type opaque: code outside this module
/// cannot construct it and it carries no further detail.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct InvalidSequence(());
impl<I: Iterator<Item = u8>> Iterator for DecodeUtf8<I> {
    type Item = Result<char, InvalidSequence>;

    /// Decodes the next character from the underlying bytes.
    ///
    /// Returns `None` once the byte source is exhausted. Otherwise returns
    /// `Some(Ok(c))` for a well-formed sequence, or
    /// `Some(Err(InvalidSequence))` when:
    ///
    /// * the first byte is a continuation byte (`10xxxxxx`), `0xFE` or `0xFF`;
    /// * a byte that should continue the sequence is missing or is not of the
    ///   form `10xxxxxx` (that byte is left in the stream for the next call);
    /// * the decoded value is not a valid `char`, or was encoded with more
    ///   bytes than `char::len_utf8` reports for it (an overlong form).
    #[inline]
    fn next(&mut self) -> Option<Result<char, InvalidSequence>> {
        let lead = self.0.next()?;

        // High bit clear: a single-byte (ASCII) character.
        if lead < 0x80 {
            return Some(Ok(lead as char));
        }

        // The number of leading one bits in the first byte is the announced
        // length of the sequence; only 2 through 6 are accepted here.
        let len = match (!lead).leading_zeros() as usize {
            n @ 2..=6 => n,
            _ => return Some(Err(InvalidSequence(()))),
        };

        // Payload bits of the lead byte: everything below the length marker.
        let mut code = u32::from(lead) & (0x7F >> len);

        let mut remaining = len - 1;
        while remaining > 0 {
            // Peek first so a non-continuation byte is not swallowed.
            let cont = match self.0.peek() {
                Some(&byte) if byte & 0xC0 == 0x80 => byte,
                _ => return Some(Err(InvalidSequence(()))),
            };
            self.0.next();
            code = (code << 6) | (u32::from(cont) & 0x3F);
            remaining -= 1;
        }

        // Reject values that are not scalar values and overlong encodings.
        let decoded = from_u32(code).filter(|c| c.len_utf8() == len);
        Some(decoded.ok_or(InvalidSequence(())))
    }
}
mod private {
    /// Sealing marker: it lives in a private module, so `UtfExt` cannot be
    /// implemented outside this crate.
    pub trait UtfExtSealed {}
}
use private::*;

/// Encoding of a code point as UTF-8 into a caller-supplied buffer.
///
/// Implemented for `char` and for raw `u32` values. The trait is sealed.
pub trait UtfExt: UtfExtSealed {
    /// Slice type handed back on success: `str` for `char`, `[u8]` for `u32`.
    type UtfSlice: ?Sized;

    /// Encodes `self` at the start of `bs`.
    ///
    /// Returns the prefix of `bs` that holds the encoding, or `None` when
    /// `bs` is too short or the value cannot be encoded.
    fn try_encode_utf8(self, bs: &mut [u8]) -> Option<&mut Self::UtfSlice>;
}

impl UtfExtSealed for char {}
impl UtfExtSealed for u32 {}

impl UtfExt for char {
    type UtfSlice = str;

    /// Encodes the character and returns it as a string slice borrowed from
    /// `bs`, or `None` when `bs` is shorter than the encoding.
    #[inline]
    fn try_encode_utf8(self, bs: &mut [u8]) -> Option<&mut str> {
        (self as u32).try_encode_utf8(bs).map(|encoded| {
            // SAFETY: `self` is a Unicode scalar value, and the `u32` impl
            // below returns exactly the bytes it wrote, which for a scalar
            // value are its standard 1- to 4-byte UTF-8 encoding.
            unsafe { str::from_utf8_unchecked_mut(encoded) }
        })
    }
}

impl UtfExt for u32 {
    type UtfSlice = [u8];

    /// Encodes the value using the UTF-8 bit layout, extended to 5- and
    /// 6-byte forms so that any value below `2^31` can be written.
    ///
    /// Returns the encoded prefix of `bs` (1 to 6 bytes). Returns `None` when
    /// `bs` is too short, or when the value needs all 32 bits and therefore
    /// has no encoding.
    fn try_encode_utf8(mut self, bs: &mut [u8]) -> Option<&mut [u8]> {
        // Encoded length indexed by the number of leading zero bits of the
        // value (0..=32). Index 0 (bit 31 set) has no encoding.
        static LENGTHS: [Fin7; 33] = [
            F0, F6, F6, F6, F6, F6, F5, F5,
            F5, F5, F5, F4, F4, F4, F4, F4,
            F3, F3, F3, F3, F3, F2, F2, F2,
            F2, F1, F1, F1, F1, F1, F1, F1, F1,
        ];
        let len = LENGTHS[self.leading_zeros() as usize] as usize;

        // Multi-byte lead bytes start with `len` one bits. A single-byte
        // encoding has no marker at all: `!(!0u8 >> 1)` would wrongly set
        // the high bit and turn ASCII into an invalid lead byte.
        let marker = if len == 1 { 0 } else { !(!0u8 >> len) };

        // Only the bytes that are actually written are handed back.
        let encoded = bs.get_mut(..len)?;
        {
            // `len == 0` (unencodable value) yields an empty slice and
            // bails out here.
            let (lead, tail) = encoded.split_first_mut()?;
            // Fill continuation bytes from the last one backwards, six
            // payload bits at a time.
            for b in tail.iter_mut().rev() {
                *b = self as u8 & 0x3F | 0x80;
                self >>= 6;
            }
            *lead = self as u8 | marker;
        }
        Some(encoded)
    }
}

/// Encoded length in bytes, restricted to the values 0 through 6.
/// `F0` marks a value that cannot be encoded.
#[derive(Clone, Copy)]
#[repr(u8)]
enum Fin7 { F0 = 0, F1 = 1, F2 = 2, F3 = 3, F4 = 4, F5 = 5, F6 = 6 }
use self::Fin7::*;
/// Decodes a table of byte strings and compares each result, with every
/// invalid sequence replaced by U+FFFD, against the expected text.
#[test]
fn test() {
    use std::vec::Vec;

    // `.iter()` rather than `.into_iter()`: on an array the latter yields
    // references only in editions before 2021, and the `&(..)` pattern below
    // needs references.
    for &(expected, bytes) in [("", &[] as &[u8]),
                               ("A", &[0x41u8] as &[u8]),
                               ("�", &[0xC1u8, 0x81u8] as &[u8]),
                               ("♥", &[0xE2u8, 0x99u8, 0xA5u8] as &[u8]),
                               ("♥A", &[0xE2u8, 0x99u8, 0xA5u8, 0x41u8] as &[u8]),
                               ("�", &[0xE2u8, 0x99u8] as &[u8]),
                               ("�A", &[0xE2u8, 0x99u8, 0x41u8] as &[u8]),
                               ("�", &[0xC0u8] as &[u8]),
                               ("�A", &[0xC0u8, 0x41u8] as &[u8]),
                               ("�", &[0x80u8] as &[u8]),
                               ("�A", &[0x80u8, 0x41u8] as &[u8]),
                               ("�", &[0xFEu8] as &[u8]),
                               ("�A", &[0xFEu8, 0x41u8] as &[u8]),
                               ("�", &[0xFFu8] as &[u8]),
                               ("�A", &[0xFFu8, 0x41u8] as &[u8]),
                               // Largest scalar value, as a 4-byte sequence.
                               ("\u{10FFFF}", &[0xF4u8, 0x8Fu8, 0xBFu8, 0xBFu8] as &[u8]),
                               // An encoded surrogate (U+D800) is one invalid sequence.
                               ("\u{FFFD}", &[0xEDu8, 0xA0u8, 0x80u8] as &[u8])].iter() {
        // Decode once and reuse the result for the check and the message.
        let decoded: Vec<char> = decode_utf8(bytes.iter().cloned())
            .map(|r_b| r_b.unwrap_or('\u{FFFD}'))
            .collect();
        assert!(Iterator::eq(expected.chars(), decoded.iter().cloned()),
                "chars = {}, bytes = {:?}, decoded = {:?}", expected, bytes, decoded);
    }
}