hefty/
extract_utf8.rs

1use bytes::Buf as _;
2
3use crate::iterable::OutputToByteStream;
4use crate::repeatable::Repeatable;
5use crate::{ByteStream, Extract, ParseAny, ParseResult, ParseWhen};
6
/// Lookup table indexed by the first byte of a UTF-8 sequence.
///
/// Each entry is the total length in bytes of the sequence that the byte
/// introduces, or `0` when the byte cannot start a sequence: the continuation
/// range `0x80..=0xBF`, the bytes `0xC0` and `0xC1`, and `0xF5..=0xFF`.
const UTF8_CHAR_WIDTH: &[u8; 256] = &[
    // Sixteen entries per row (low nibble 0..=F); the trailing comment on each
    // row is the high nibble of the bytes it covers.
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 2
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 3
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // F
];

// Local copy of the standard library's unstable `utf8_char_width` helper.
/// Returns the length in bytes of the UTF-8 sequence that starts with `b`.
///
/// The result is `0` when `b` is not a valid first byte, so callers can use it
/// both to size a character and to reject a bad lead byte.
#[must_use]
#[inline]
const fn utf8_char_width(b: u8) -> usize {
    let index = b as usize;
    let width = UTF8_CHAR_WIDTH[index];
    width as usize
}
34
35// 'a'
36impl Extract for char {
37    type State = (u8, ByteStream);
38    type Output = ByteStream;
39
40    fn extract(
41        &self,
42        mut input: ByteStream,
43        state: Option<Self::State>,
44        last: bool,
45    ) -> ParseResult<Self::State, ByteStream> {
46        let mut char_buf = [0u8; 4];
47        let mut bytes = self.encode_utf8(&mut char_buf).as_bytes();
48        let (mut seen, mut output) = state.unwrap_or((0, ByteStream::new(input.position())));
49        bytes.advance(seen as usize);
50        let matched = input.common_prefix_length(bytes);
51        if matched == bytes.len() {
52            output.merge(input.take_before(matched));
53            ParseResult::Match(output, input)
54        } else if !last && matched == input.remaining() {
55            output.merge(input.take_before(matched));
56            seen += matched as u8;
57            ParseResult::Partial((seen, output))
58        } else {
59            ParseResult::NoMatch(output.position())
60        }
61    }
62}
63
// 'a'.optional()
/// Empty impl: it adds no items of its own, so `char` uses whatever
/// `Repeatable` supplies by default (presumably the combinators such as
/// `.optional()` shown above — confirm against the trait definition).
impl Repeatable for char {}

/// The `char` parser's output is already a `ByteStream`, so no conversion is
/// needed.
impl OutputToByteStream for char {
    /// Returns `output` unchanged.
    fn output_to_bytestream(output: Self::Output) -> ByteStream {
        output
    }
}
72
73// "abc"
74impl Extract for &str {
75    type State = (usize, ByteStream);
76    type Output = ByteStream;
77
78    fn extract(
79        &self,
80        mut input: ByteStream,
81        state: Option<Self::State>,
82        last: bool,
83    ) -> ParseResult<Self::State, ByteStream> {
84        let mut bytes = self.as_bytes();
85        let (mut seen, mut output) = state.unwrap_or((0, ByteStream::new(input.position())));
86        bytes.advance(seen);
87        let matched = input.common_prefix_length(bytes);
88        if matched == bytes.len() {
89            output.merge(input.take_before(matched));
90            ParseResult::Match(output, input)
91        } else if !last && matched == input.remaining() {
92            output.merge(input.take_before(matched));
93            seen += matched;
94            ParseResult::Partial((seen, output))
95        } else {
96            ParseResult::NoMatch(output.position())
97        }
98    }
99}
100
/// Empty impl: it adds no items of its own, so `&str` uses whatever
/// `Repeatable` supplies by default.
impl Repeatable for &str {}

/// The `&str` parser's output is already a `ByteStream`, so no conversion is
/// needed.
impl OutputToByteStream for &str {
    /// Returns `output` unchanged.
    fn output_to_bytestream(output: Self::Output) -> ByteStream {
        output
    }
}
108
/// Parser returned by `char::any()`.
///
/// It is a unit struct with no configuration, so it derives `Clone`, `Copy`
/// and `Debug`: values can be duplicated freely and shown in diagnostics.
#[derive(Clone, Copy, Debug)]
pub struct AnyCharParser;
110
111// char::any()
112impl Extract for AnyCharParser {
113    type State = (usize, ByteStream);
114    type Output = ByteStream;
115
116    fn extract(
117        &self,
118        mut input: ByteStream,
119        state: Option<Self::State>,
120        last: bool,
121    ) -> ParseResult<Self::State, ByteStream> {
122        let (mut required, mut output) = state.unwrap_or((0, ByteStream::new(input.position())));
123        if required == 0 {
124            match input.iter().next() {
125                Some(&b) => {
126                    required = utf8_char_width(b);
127                    if required == 0 {
128                        return ParseResult::NoMatch(output.position());
129                    }
130                }
131                None if last => {
132                    return ParseResult::NoMatch(output.position());
133                }
134                None => {
135                    return ParseResult::Partial((required, output));
136                }
137            }
138        }
139        let input_len = input.remaining();
140        if input_len < required {
141            if last {
142                ParseResult::NoMatch(output.position())
143            } else {
144                required -= input_len;
145                output.merge(input.take_before(input_len));
146                ParseResult::Partial((required, output))
147            }
148        } else {
149            output.merge(input.take_before(required));
150            ParseResult::Match(output, input)
151        }
152    }
153}
154
// char::any().optional()
/// Empty impl: it adds no items of its own, so `AnyCharParser` uses whatever
/// `Repeatable` supplies by default (presumably the combinators such as
/// `.optional()` shown above — confirm against the trait definition).
impl Repeatable for AnyCharParser {}

/// `AnyCharParser`'s output is already a `ByteStream`, so no conversion is
/// needed.
impl OutputToByteStream for AnyCharParser {
    /// Returns `output` unchanged.
    fn output_to_bytestream(output: Self::Output) -> ByteStream {
        output
    }
}
163
/// Provides `char::any()`, whose parser type is [`AnyCharParser`].
impl ParseAny for char {
    type Parser = AnyCharParser;

    /// Returns the parser; `AnyCharParser` is a unit struct, so there is
    /// nothing to configure.
    fn any() -> Self::Parser {
        AnyCharParser
    }
}
171
/// Parser produced by `char::when`: it wraps the predicate `F` that its
/// `Extract` implementation calls on each decoded character.
#[derive(Clone)]
pub struct CharWhenParser<F>(F);
174
175// char::when(|c|...)
176impl<F> Extract for CharWhenParser<F>
177where
178    F: Fn(char) -> bool,
179{
180    type State = (usize, ByteStream);
181    type Output = ByteStream;
182
183    fn extract(
184        &self,
185        mut input: ByteStream,
186        state: Option<Self::State>,
187        last: bool,
188    ) -> ParseResult<Self::State, ByteStream> {
189        let (mut required, mut output) = state.unwrap_or((0, ByteStream::new(input.position())));
190        if required == 0 {
191            match input.iter().next() {
192                Some(&b) => {
193                    required = utf8_char_width(b);
194                    if required == 0 {
195                        return ParseResult::NoMatch(output.position());
196                    }
197                }
198                None if last => {
199                    return ParseResult::NoMatch(output.position());
200                }
201                None => {
202                    return ParseResult::Partial((required, output));
203                }
204            }
205        }
206        let input_len = input.remaining();
207        if input_len < required {
208            if last {
209                ParseResult::NoMatch(output.position())
210            } else {
211                output.merge(input.take_before(input_len));
212                ParseResult::Partial((required - input_len, output))
213            }
214        } else {
215            output.merge(input.take_before(required));
216            let mut bytes = [0; 4];
217            let len = output.fill_slice(&mut bytes);
218            match std::str::from_utf8(&bytes[..len]) {
219                Ok(s) => {
220                    let Some(c) = s.chars().next() else {
221                        return ParseResult::NoMatch(output.position());
222                    };
223                    if (self.0)(c) {
224                        ParseResult::Match(output, input)
225                    } else {
226                        ParseResult::NoMatch(output.position())
227                    }
228                }
229                Err(_) => ParseResult::NoMatch(output.position()),
230            }
231        }
232    }
233}
234
// char::when(|c|...).optional()
/// Empty impl: it adds no items of its own, so `CharWhenParser` uses whatever
/// `Repeatable` supplies by default (presumably the combinators such as
/// `.optional()` shown above — confirm against the trait definition).
impl<F> Repeatable for CharWhenParser<F> where F: Fn(char) -> bool {}

/// `CharWhenParser`'s output is already a `ByteStream`, so no conversion is
/// needed.
impl<F> OutputToByteStream for CharWhenParser<F>
where
    F: Fn(char) -> bool,
{
    /// Returns `output` unchanged.
    fn output_to_bytestream(output: Self::Output) -> ByteStream {
        output
    }
}
246
/// Provides `char::when(predicate)`, whose parser type is
/// [`CharWhenParser`] over the predicate type `F`.
impl<F> ParseWhen<char, F> for char
where
    F: Fn(char) -> bool,
{
    type Parser = CharWhenParser<F>;

    /// Wraps `f` in a `CharWhenParser`; `f` itself is not called here.
    fn when(f: F) -> Self::Parser {
        CharWhenParser(f)
    }
}