binator/base/
utf8.rs

1use crate::{
2  base::{
3    octet,
4    BaseAtom,
5  },
6  utils::Utils,
7  Contexting,
8  CoreAtom,
9  Parse,
10  Parsed,
11  Streaming,
12  Success,
13};
14
15fn raw<Stream, Context>(stream: Stream) -> Parsed<u32, Stream, Context>
16where
17  Stream: Streaming,
18  Context: Contexting<CoreAtom<Stream>>,
19  Context: Contexting<BaseAtom<u8>>,
20  Stream::Item: Into<u8>,
21{
22  let Success { token: a, stream } = octet.parse(stream)?;
23  if a & 0x80 == 0 {
24    Parsed::Success {
25      token: a as u32,
26      stream,
27    }
28  } else if a & 0xE0 == 0xC0 {
29    let Success { token: b, stream } = octet.parse(stream)?;
30
31    Parsed::Success {
32      token: (a as u32 & 0x1F) << 6 | (b as u32 & 0x3F),
33      stream,
34    }
35  } else if a & 0xF0 == 0xE0 {
36    let Success { token: b, stream } = octet.parse(stream)?;
37    let Success { token: c, stream } = octet.parse(stream)?;
38
39    Parsed::Success {
40      token: (a as u32 & 0x0F) << 12 | (b as u32 & 0x3F) << 6 | (c as u32 & 0x3F),
41      stream,
42    }
43  } else if a & 0xF8 == 0xF0 {
44    let Success { token: b, stream } = octet.parse(stream)?;
45    let Success { token: c, stream } = octet.parse(stream)?;
46    let Success { token: d, stream } = octet.parse(stream)?;
47
48    Parsed::Success {
49      token: (a as u32 & 0x07) << 18
50        | (b as u32 & 0x3F) << 12
51        | (c as u32 & 0x3F) << 6
52        | (d as u32 & 0x3F),
53      stream,
54    }
55  } else {
56    Parsed::Failure(Contexting::new(BaseAtom::Utf8 {}))
57  }
58}
59
60/// Parser that will read stream and return valid utf8 char
61/// If you are expecting utf8 you MUST use this Parser, and not character
62/// Parser.
63#[cfg_attr(
64  feature = "tracing",
65  tracing::instrument(level = "trace", skip_all, ret(Display))
66)]
67pub fn utf8<Stream, Context>(stream: Stream) -> Parsed<char, Stream, Context>
68where
69  Stream: Streaming,
70  Context: Contexting<CoreAtom<Stream>>,
71  Context: Contexting<BaseAtom<u8>>,
72  Stream::Item: Into<u8>,
73{
74  raw
75    .try_map(|raw| char::from_u32(raw).ok_or_else(|| Contexting::new(BaseAtom::Utf8 {})))
76    .parse(stream)
77}
78
#[cfg(test)]
mod tests {
  use crate::{
    context::Ignore,
    Parsed,
  };

  /// Runs the `utf8` parser over `input` using the `Ignore` context.
  fn decode(input: &[u8]) -> Parsed<char, &[u8], Ignore> {
    super::utf8::<_, Ignore>(input)
  }

  /// Three-octet sequence.
  #[test]
  fn utf8() {
    assert_eq!(
      decode("❤".as_bytes()),
      Parsed::Success {
        token: '❤',
        stream: "".as_bytes(),
      }
    );
  }

  /// One-octet (ASCII) sequence.
  #[test]
  fn utf8_one_octet() {
    assert_eq!(
      decode("a".as_bytes()),
      Parsed::Success {
        token: 'a',
        stream: "".as_bytes(),
      }
    );
  }

  /// Two-octet sequence.
  #[test]
  fn utf8_two_octets() {
    assert_eq!(
      decode("é".as_bytes()),
      Parsed::Success {
        token: 'é',
        stream: "".as_bytes(),
      }
    );
  }

  /// Four-octet sequence.
  #[test]
  fn utf8_four_octets() {
    assert_eq!(
      decode("😀".as_bytes()),
      Parsed::Success {
        token: '😀',
        stream: "".as_bytes(),
      }
    );
  }

  /// Only the octets of the first character are consumed.
  #[test]
  fn utf8_leaves_remaining_input() {
    assert_eq!(
      decode("❤!".as_bytes()),
      Parsed::Success {
        token: '❤',
        stream: "!".as_bytes(),
      }
    );
  }

  /// `0xFF` can never start a UTF-8 sequence.
  #[test]
  fn utf8_invalid_lead_octet() {
    assert!(matches!(decode(&[0xFF_u8]), Parsed::Failure(_)));
  }
}