1use std::convert::AsRef;
23use std::convert::From;
24use std::fmt;
25use std::io::Read;
26use std::iter::Iterator;
27use std::str::FromStr;
28
/// Generates a symmetric pair of `PartialEq` impls between a wrapper type and
/// a plain string-like type.
///
/// The wrapper (first argument) must expose an `as_str()` method; both
/// generated impls compare the result of that call with the plain value
/// (second argument), so `wrapper == plain` and `plain == wrapper` agree.
macro_rules! impl_eq {
    ($wrapper:ty, $plain:ty) => {
        impl PartialEq<$plain> for $wrapper {
            fn eq(&self, rhs: &$plain) -> bool {
                self.as_str() == rhs
            }
        }

        impl PartialEq<$wrapper> for $plain {
            fn eq(&self, rhs: &$wrapper) -> bool {
                rhs.as_str() == self
            }
        }
    };
}
44
/// A single character kept as its UTF-8 encoded bytes.
///
/// The encoding is right-aligned inside a fixed four-byte array: a character
/// whose encoding is shorter than four bytes is padded with zero bytes at the
/// front, so an ASCII byte `b` is stored as `[0, 0, 0, b]`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Utf8Char([u8; 4]);
51
52impl Utf8Char {
53 pub fn as_slice(&self) -> &[u8] {
55 match self.0 {
56 [0, 0, 0, 0] | [0, 0, 0, _] => &self.0[3..],
57 [0, 0, _, _] => &self.0[2..],
58 [0, _, _, _] => &self.0[1..],
59 _ => &self.0[..],
60 }
61 }
62
63 pub fn as_str(&self) -> &str {
65 self.as_ref()
66 }
67
68 pub fn is_whitespace(&self) -> bool {
72 match self.0 {
73 [0, 0, 0, 9] | [0, 0, 0, 10] | [0, 0, 0, 13] | [0, 0, 0, 32] => true,
74 _ => false,
75 }
76 }
77
78 pub fn is_ascii_digit(&self) -> bool {
82 match self.0 {
83 [0, 0, 0, v] if v >= b'0' && v <= b'9' => true,
84 _ => false,
85 }
86 }
87
88 pub fn is_alphabetic(&self) -> bool {
90 match self.0 {
91 [0, 0, 0, v] if v >= b'A' && v <= b'Z' || v >= b'a' && v <= b'z' => true,
92 _ => false,
93 }
94 }
95
96 pub fn to_digit(&self) -> Option<u32> {
103 match self.0 {
104 [0, 0, 0, v] if v >= b'0' && v <= b'9' => Some((v - b'0').into()),
105 _ => None,
106 }
107 }
108}
109
110impl From<u8> for Utf8Char {
111 fn from(value: u8) -> Self {
112 Self([0, 0, 0, value])
113 }
114}
115
116impl From<u32> for Utf8Char {
117 fn from(value: u32) -> Self {
118 Self(value.to_be_bytes())
119 }
120}
121
122impl From<char> for Utf8Char {
123 fn from(value: char) -> Self {
124 let mut b = [0; 4];
125 let st = value.encode_utf8(&mut b);
126 let st = st.as_bytes();
127 let mut b = [0; 4];
128 for (i, v) in ((4 - st.len())..4).enumerate() {
129 b[v] = st[i];
130 }
131
132 Self(b)
133 }
134}
135
136impl fmt::Display for Utf8Char {
137 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
138 write!(
139 f,
140 "{}",
141 String::from_str(self.as_str()).expect("cannot convert to a String")
142 )
143 }
144}
145
146impl AsRef<str> for Utf8Char {
147 fn as_ref(&self) -> &str {
148 use std::str;
149 str::from_utf8(self.as_slice())
150 .expect("cannot convert to a str, maybe is not a valid UTF-8 character")
151 }
152}
153
154impl PartialEq<&str> for Utf8Char {
155 fn eq(&self, other: &&str) -> bool {
156 self.as_str() == *other
157 }
158}
159
160impl PartialEq<Utf8Char> for &str {
161 fn eq(&self, other: &Utf8Char) -> bool {
162 other.as_str() == *self
163 }
164}
165
// Generate `==` in both directions against unsized `str`, once for the value
// type and once for a shared reference to it.
impl_eq!(Utf8Char, str);
impl_eq!(&Utf8Char, str);
168
/// Wraps a byte source so it can be consumed one UTF-8 character at a time.
pub struct Utf8Reader<T>(T)
where
    T: Read;

impl<T> Utf8Reader<T>
where
    T: Read,
{
    /// Creates a reader over `inner`; nothing is read until iteration starts.
    pub fn new(inner: T) -> Self {
        Utf8Reader(inner)
    }
}
207
208impl<T: Read> Iterator for Utf8Reader<T> {
209 type Item = Utf8Char;
210
211 fn next(&mut self) -> Option<Self::Item> {
212 let mut b = [0u8; 1];
213 let size = self.0.read(&mut b).expect("read a byte faied");
214 if size == 0 {
215 return None;
216 }
217
218 let first_byte = b[0];
219 if first_byte < 128 {
220 return Some(first_byte.into());
221 }
222
223 let utf8_32 = match first_byte & 0b11100000 {
224 0b11110000 => exact_next(&mut self.0, 3, first_byte),
225 0b11100000 => exact_next(&mut self.0, 2, first_byte),
226 0b11000000 => exact_next(&mut self.0, 1, first_byte),
227 _ => first_byte as u32,
228 };
229
230 Some(utf8_32.into())
231 }
232}
233
/// Reads up to `count` further bytes and appends each one to `first_byte`,
/// most significant byte first.
///
/// The result starts as `first_byte`; every byte successfully read shifts the
/// accumulated value left by eight bits and fills the low byte. A read that
/// returns no data (end of input) leaves the value unchanged, so a truncated
/// sequence yields a shorter value rather than an error.
///
/// # Panics
///
/// Panics if the reader returns an I/O error.
fn exact_next(read: &mut impl Read, count: usize, first_byte: u8) -> u32 {
    let mut byte = [0u8; 1];

    (0..count).fold(u32::from(first_byte), |packed, _| {
        match read.read(&mut byte).expect("read a byte faied") {
            0 => packed,
            _ => (packed << 8) | u32::from(byte[0]),
        }
    })
}
247
#[cfg(test)]
mod test {
    use super::*;
    use std::io::Cursor;

    /// Builds a reader positioned at the start of an in-memory copy of `text`.
    ///
    /// Constructing the cursor from the bytes directly avoids a `write` call
    /// whose returned byte count would have to be checked.
    fn reader_over(text: &str) -> Utf8Reader<Cursor<Vec<u8>>> {
        Utf8Reader::new(Cursor::new(text.as_bytes().to_vec()))
    }

    #[test]
    fn test_whitespace() {
        let mut r = reader_over(" d\t\r\n");

        assert!(r.next().unwrap().is_whitespace());
        assert!(!r.next().unwrap().is_whitespace());
        assert!(r.next().unwrap().is_whitespace());
        assert!(r.next().unwrap().is_whitespace());
        assert!(r.next().unwrap().is_whitespace());
        assert!(r.next().is_none());
    }

    #[test]
    fn test_digit() {
        let mut r = reader_over("0123456789abi");

        // Ten digits first, then three letters.
        for _ in 0..10 {
            assert!(r.next().unwrap().is_ascii_digit());
        }
        for _ in 0..3 {
            assert!(!r.next().unwrap().is_ascii_digit());
        }
        assert!(r.next().is_none());
    }

    #[test]
    fn test_to_digit() {
        let mut r = reader_over("0123456789abi");

        for expected in 0..10u32 {
            assert_eq!(Some(expected), r.next().unwrap().to_digit());
        }
        for _ in 0..3 {
            assert_eq!(None, r.next().unwrap().to_digit());
        }
        assert_eq!(None, r.next());
    }

    #[test]
    fn is_alphabetic() {
        let mut r = reader_over("abcdABCDEZz0000");

        // Eleven ASCII letters followed by four digits.
        for _ in 0..11 {
            assert!(r.next().unwrap().is_alphabetic());
        }
        for _ in 0..4 {
            assert!(!r.next().unwrap().is_alphabetic());
        }
        assert!(r.next().is_none());
    }

    #[test]
    fn test_display() {
        let mut r = reader_over("复// d");

        assert_eq!("复".to_string(), r.next().unwrap().to_string());
        assert_eq!("/".to_string(), r.next().unwrap().to_string());
    }

    #[test]
    fn test_as_str() {
        let mut r = reader_over("复// d");

        for expected in ["复", "/", "/", " ", "d"].iter() {
            let utf8char = r.next().unwrap();
            let text: &str = utf8char.as_ref();
            assert_eq!(*expected, text);
        }
        assert_eq!(None, r.next());
    }

    #[test]
    fn test_iterator() {
        let input = "复// d❤\n1+1=2 // é异";
        let mut r = reader_over(input);

        // Every character of the input must come back in order, unchanged.
        for expected in input.chars() {
            assert_eq!(Some(Utf8Char::from(expected)), r.next());
        }
        assert_eq!(None, r.next());
    }

    #[test]
    fn wrong_character() {
        let mut r = reader_over("\u{D7FF}复");

        assert_eq!(Some('\u{D7FF}'.into()), r.next());
        assert_eq!(Some('复'.into()), r.next());
        assert_eq!(None, r.next());
    }

    #[test]
    fn equal_str() {
        let mut r = reader_over("0a/*-比");

        let v = r.next().unwrap();
        assert_eq!("0", v);
        assert_eq!("0", &v);
        assert_eq!("a", r.next().unwrap());
        assert_eq!("/", r.next().unwrap());
        assert_eq!("*", r.next().unwrap());
        assert_eq!("-", r.next().unwrap());
        assert_eq!(r.next().unwrap(), "比");
        assert_eq!(None, r.next());
    }
}