somen_decode/
utf8.rs

1use somen::prelude::*;
2
3/// A UTF-8 encoded [`u8`] decoder.
4///
5/// # Examples
6/// ```
7/// # futures_executor::block_on(async {
8/// # use somen_decode::utf8;
9/// use somen::prelude::*;
10///
11/// let mut parser = utf8();
12/// let mut stream = stream::from_slice(b"A\xC3\x85\xE3\x81\x82\xF0\x9F\x92\xAF\xC0\xAF");
13///
14/// assert_eq!(parser.parse(&mut stream).await, Ok('A'));
15/// assert_eq!(parser.parse(&mut stream).await, Ok('Å'));
16/// assert_eq!(parser.parse(&mut stream).await, Ok('あ'));
17/// assert_eq!(parser.parse(&mut stream).await, Ok('💯'));
18/// assert!(parser.parse(&mut stream).await.is_err());
19/// # });
20/// ```
21pub fn utf8<'a, I>() -> impl Parser<I, Output = char>
22where
23    I: Positioned<Ok = u8> + ?Sized + 'a,
24{
25    // First byte
26    is_some(|b1| match b1 {
27        0x00..=0x7F => Some(Ok(b1 as u32)),
28        0xC2..=0xF4 => Some(Err(b1)),
29        _ => None,
30    })
31    // Second byte
32    .then(|res| match res {
33        Ok(c) => value(Ok(c)).left(),
34        Err(b1) => is_some(move |b2| {
35            if b2 & 0xC0 != 0x80
36                || (b1 == 0xE0 && b2 < 0xA0)
37                || (b1 == 0xED && b2 >= 0xA0)
38                || (b1 == 0xF0 && b2 < 0x90)
39                || (b1 == 0xF4 && b2 >= 0x90)
40            {
41                None
42            } else if b1 & 0xE0 == 0xC0 {
43                Some(Ok(((b1 & 0x1F) as u32) << 6 | (b2 & 0x3F) as u32))
44            } else {
45                Some(Err((b1, b2)))
46            }
47        })
48        .right(),
49    })
50    // Third byte
51    .then(|res| match res {
52        Ok(c) => value(Ok(c)).left(),
53        Err((b1, b2)) => is_some(move |b3| {
54            if b3 & 0xC0 != 0x80 {
55                None
56            } else if b1 & 0xF0 == 0xE0 {
57                Some(Ok(((b1 & 0x0F) as u32) << 12
58                    | ((b2 & 0x3F) as u32) << 6
59                    | (b3 & 0x3F) as u32))
60            } else {
61                Some(Err((b1, b2, b3)))
62            }
63        })
64        .right(),
65    })
66    // Last byte
67    .then(|res| match res {
68        Ok(c) => value(c).left(),
69        Err((b1, b2, b3)) => is_some(move |b4| {
70            if b4 & 0xC0 != 0x80 {
71                None
72            } else {
73                Some(
74                    ((b1 & 0x07) as u32) << 18
75                        | ((b2 & 0x3F) as u32) << 12
76                        | ((b3 & 0x3F) as u32) << 6
77                        | (b4 & 0x3F) as u32,
78                )
79            }
80        })
81        .right(),
82    })
83    .map(|c| unsafe { char::from_u32_unchecked(c) })
84    .expect("UTF-8 character")
85}