somen_decode/utf16.rs
1use somen::prelude::*;
2
/// Classifies a single UTF-16 code unit.
///
/// * `Some(Ok(c))` — `b` is outside the surrogate range and is itself the
///   scalar value `c`.
/// * `Some(Err(b))` — `b` is a high surrogate (`0xD800..=0xDBFF`); the unit is
///   handed back so it can be combined with the unit that follows it.
/// * `None` — `b` is a low surrogate (`0xDC00..=0xDFFF`), which cannot start a
///   character.
fn decode_one(b: u16) -> Option<Result<char, u16>> {
    // The top six bits identify which surrogate half (if any) the unit is.
    match b & 0xFC00 {
        0xDC00 => None,
        0xD800 => Some(Err(b)),
        // Every remaining 16-bit value is a valid scalar value, so the checked
        // conversion always yields `Some` here.
        _ => char::from_u32(u32::from(b)).map(Ok),
    }
}
13
/// Combines a UTF-16 surrogate pair into the scalar value it encodes.
///
/// `b1` is the leading (high) surrogate and `b2` is the unit that follows it.
/// Returns `None` when `b2` is not a low surrogate (`0xDC00..=0xDFFF`).
///
/// Only the low ten bits of `b1` are used, so the result is always within
/// `U+10000..=U+10FFFF`; callers are expected to pass a unit that has already
/// been classified as a high surrogate.
fn decode_two(b1: u16, b2: u16) -> Option<char> {
    if b2 & 0xFC00 != 0xDC00 {
        return None;
    }
    // Each half carries ten payload bits. Concatenated they give the offset
    // of the code point from U+10000, the first supplementary code point.
    let offset = (u32::from(b1 & 0x3FF) << 10) | u32::from(b2 & 0x3FF);
    // `offset` is at most 0xFFFFF, so the sum never exceeds U+10FFFF and can
    // never land in the surrogate range; the checked conversion is `Some`.
    char::from_u32(0x1_0000 + offset)
}
23
24/// A UTF-16 encoded [`u16`] decoder.
25///
26/// # Examples
27/// ```
28/// # futures_executor::block_on(async {
29/// # use somen_decode::utf16;
30/// use somen::prelude::*;
31///
32/// let mut parser = utf16();
33/// let mut stream = stream::from_slice(&[
34/// 0xD834, 0xDD1E, 0x004d, 0x0075, 0x0073, 0x0069, 0x0063, 0xD834,
35/// ]);
36///
37/// assert_eq!(parser.parse(&mut stream).await, Ok('𝄞'));
38/// assert_eq!(parser.parse(&mut stream).await, Ok('M'));
39/// assert_eq!(parser.parse(&mut stream).await, Ok('u'));
40/// assert_eq!(parser.parse(&mut stream).await, Ok('s'));
41/// assert_eq!(parser.parse(&mut stream).await, Ok('i'));
42/// assert_eq!(parser.parse(&mut stream).await, Ok('c'));
43/// assert!(parser.parse(&mut stream).await.is_err());
44/// # });
45/// ```
46pub fn utf16<'a, I>() -> impl Parser<I, Output = char>
47where
48 I: Positioned<Ok = u16> + ?Sized + 'a,
49{
50 is_some(decode_one)
51 .then(|res| match res {
52 Ok(c) => value(c).left(),
53 Err(b1) => is_some(move |b2| decode_two(b1, b2)).right(),
54 })
55 .expect("UTF-16 character")
56}
57
58/// A UTF-16 encoded [`u8`] decoder. (big-endian)
59///
60/// # Examples
61/// ```
62/// # futures_executor::block_on(async {
63/// # use somen_decode::utf16_be;
64/// use somen::prelude::*;
65///
66/// let mut parser = utf16_be();
67/// let mut stream = stream::from_slice(
68/// b"\xD8\x34\xDD\x1E\x00\x4d\x00\x75\x00\x73\x00\x69\x00\x63\xD8\x34",
69/// );
70///
71/// assert_eq!(parser.parse(&mut stream).await, Ok('𝄞'));
72/// assert_eq!(parser.parse(&mut stream).await, Ok('M'));
73/// assert_eq!(parser.parse(&mut stream).await, Ok('u'));
74/// assert_eq!(parser.parse(&mut stream).await, Ok('s'));
75/// assert_eq!(parser.parse(&mut stream).await, Ok('i'));
76/// assert_eq!(parser.parse(&mut stream).await, Ok('c'));
77/// assert!(parser.parse(&mut stream).await.is_err());
78/// # });
79/// ```
80pub fn utf16_be<'a, I>() -> impl Parser<I, Output = char>
81where
82 I: Positioned<Ok = u8> + ?Sized + 'a,
83{
84 any()
85 .times(2)
86 .fill(0)
87 .try_map(|b| decode_one(u16::from_be_bytes(b.unwrap())).ok_or("UTF-16BE character"))
88 .rewindable()
89 .then(|res| match res {
90 Ok(c) => value(c).left(),
91 Err(b1) => any()
92 .times(2)
93 .fill(0)
94 .try_map(move |b2| {
95 decode_two(b1, u16::from_be_bytes(b2.unwrap())).ok_or("UTF-16BE character")
96 })
97 .right(),
98 })
99}
100
101/// A UTF-16 encoded [`u8`] decoder. (little-endian)
102///
103/// # Examples
104/// ```
105/// # futures_executor::block_on(async {
106/// # use somen_decode::utf16_le;
107/// use somen::prelude::*;
108///
109/// let mut parser = utf16_le();
110/// let mut stream = stream::from_slice(
111/// b"\x34\xD8\x1E\xDD\x4d\x00\x75\x00\x73\x00\x69\x00\x63\x00\x34\xD8",
112/// );
113///
114/// assert_eq!(parser.parse(&mut stream).await, Ok('𝄞'));
115/// assert_eq!(parser.parse(&mut stream).await, Ok('M'));
116/// assert_eq!(parser.parse(&mut stream).await, Ok('u'));
117/// assert_eq!(parser.parse(&mut stream).await, Ok('s'));
118/// assert_eq!(parser.parse(&mut stream).await, Ok('i'));
119/// assert_eq!(parser.parse(&mut stream).await, Ok('c'));
120/// assert!(parser.parse(&mut stream).await.is_err());
121/// # });
122/// ```
123pub fn utf16_le<'a, I>() -> impl Parser<I, Output = char>
124where
125 I: Positioned<Ok = u8> + ?Sized + 'a,
126{
127 any()
128 .times(2)
129 .fill(0)
130 .try_map(|b| decode_one(u16::from_le_bytes(b.unwrap())).ok_or("UTF-16LE character"))
131 .rewindable()
132 .then(|res| match res {
133 Ok(c) => value(c).left(),
134 Err(b1) => any()
135 .times(2)
136 .fill(0)
137 .try_map(move |b2| {
138 decode_two(b1, u16::from_le_bytes(b2.unwrap())).ok_or("UTF-16LE character")
139 })
140 .right(),
141 })
142}