/// A cursor over a borrowed byte slice.
///
/// `data` is the input being parsed and `pos` is the byte offset of the next
/// byte to be read. The parser never copies or mutates the input; cloning a
/// parser is therefore cheap and yields an independent cursor over the same
/// bytes, which is convenient for speculative parsing and backtracking.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Parser<'a> {
    /// The complete input.
    data: &'a [u8],
    /// Byte offset of the cursor into `data`.
    pos: usize,
}
6
/// Identifies which edge of the input an out-of-bounds access ran past.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Bound {
    /// The beginning of the input (offset zero).
    Start,
    /// The end of the input.
    End,
}
12
13#[derive(Debug, PartialEq, Eq)]
14pub enum Error {
15 NotFound,
16 BadUtf8Encoding,
17 OutOfBounds(Bound),
18}
19
/// Shorthand for `std::result::Result` with this module's [`Error`] as the
/// error type.
pub type Result<T> = std::result::Result<T, Error>;
21
22pub fn is_oob<T>(r: Result<T>) -> bool {
23 match r {
24 Err(Error::OutOfBounds(_)) => true,
25 Err(_) => false,
26 Ok(_) => false,
27 }
28}
29
30impl<'a> Parser<'a> {
31 pub fn from_bytes(data: &'a [u8]) -> Parser<'a> {
32 Parser { data: data, pos: 0 }
33 }
34
35 pub fn from_str(string: &'a str) -> Parser<'a> {
36 Parser {
37 data: string.as_bytes(),
38 pos: 0,
39 }
40 }
41
42 #[inline(always)]
43 pub fn set_pos(&mut self, pos: usize) {
44 self.pos = pos;
45 }
46
47 #[inline(always)]
48 pub fn pos(&self) -> usize {
49 self.pos
50 }
51
52 #[inline(always)]
53 pub fn data(&self) -> &[u8] {
54 self.data
55 }
56
57 #[inline(always)]
58 pub fn len(&self) -> usize {
59 self.data.len()
60 }
61
62 pub fn pull_byte(&mut self) -> Result<u8> {
63 if self.pos + 1 > self.len() {
64 return Err(Error::OutOfBounds(Bound::End));
65 }
66
67 let c = self.data[self.pos];
68 self.pos += 1;
69 return Ok(c);
70 }
71
72 pub fn skip<F: Fn(char) -> bool>(&mut self, should_skip: F) -> Result<()> {
73 let len = self.len();
74 while self.pos < len {
75 let prev = self.pos();
76 if should_skip(self.pull_char()?) {
77 continue;
78 } else {
79 self.set_pos(prev);
80 return Ok(());
81 }
82 }
83
84 return Err(Error::OutOfBounds(Bound::End));
85 }
86
87 pub fn skip_whitespace(&mut self) -> Result<()> {
88 self.skip(|c| c.is_ascii_whitespace())
89 }
90
91 pub fn parse_digits(&mut self) -> Result<u16> {
92 let mut i = self.pos();
93 let mut digits = 0u16;
94 let mut number = 0u16;
95
96 while i < self.len() {
97 let byte = self.data()[i];
98
99 if (byte & 0x80) == 0x80 || !(byte as char).is_ascii_digit() || digits == 5 {
101 break;
102 }
103
104 let digit = (byte - b'0') as u16;
105
106 number = number.saturating_mul(10).saturating_add(digit);
107
108 digits += 1;
109 i += 1;
110 }
111
112 if digits == 0 || digits > 5 {
113 return Err(Error::NotFound);
114 }
115
116 self.set_pos(i);
117 Ok(number)
118 }
119
120 pub fn parse_char(&mut self, matching: char) -> Result<()> {
122 let start = self.pos();
123 let c = self.pull_char()?;
124 if c == matching {
125 return Ok(());
126 }
127 self.set_pos(start);
128 return Err(Error::NotFound);
129 }
130
131 pub fn peek_char(&mut self) -> Result<char> {
132 let peek = true;
133 self.pull_or_peek_char(peek)
134 }
135
136 pub fn pull_char(&mut self) -> Result<char> {
137 let peek = false;
138 self.pull_or_peek_char(peek)
139 }
140
141 pub fn peek_prev_byte(&mut self) -> Result<u8> {
142 if self.pos == 0 {
143 return Err(Error::OutOfBounds(Bound::Start));
144 }
145
146 Ok(self.data[self.pos - 1])
147 }
148
149 pub fn seek_prev_byte(&mut self) -> Result<()> {
150 if self.pos == 0 {
151 return Err(Error::OutOfBounds(Bound::Start));
152 }
153 self.pos -= 1;
154
155 Ok(())
156 }
157
158 pub fn peek_prev_char(&self) -> Result<char> {
159 let mut i = 1;
160 let codepoint: u32;
161 let mut bs: [u32; 4] = [0; 4];
162
163 if self.pos == 0 {
164 return Err(Error::OutOfBounds(Bound::Start));
165 }
166
167 while i <= 4 && ((self.pos as i32) - (i as i32) >= 0) {
168 let byte = self.data[self.pos - i] as u32;
169 let masked = byte & 0b11000000;
170 if masked == 0b10000000 {
171 bs[i - 1] = byte & 0b00111111;
173 i += 1;
174 } else if masked == 0b11000000 {
175 match i {
177 4 => {
178 codepoint = ((bs[3] & 0x07) << 18)
179 | ((bs[2] & 0x3F) << 12)
180 | ((bs[1] & 0x3F) << 6)
181 | (bs[0] & 0x3F)
182 }
183 3 => {
184 codepoint = ((bs[2] & 0x0F) << 12) | ((bs[1] & 0x3F) << 6) | (bs[0] & 0x3F)
185 }
186 2 => codepoint = ((bs[1] & 0x0F) << 6) | (bs[0] & 0x3F),
187 _ => return Err(Error::BadUtf8Encoding),
188 }
189 return parser_codepoint_char(codepoint);
190 } else {
191 return parser_codepoint_char(byte);
192 }
193 }
194
195 Err(Error::BadUtf8Encoding)
197 }
198
199 pub fn seek_prev_char(&mut self) -> Result<()> {
200 self.seek_prev_byte()?;
201 while self.pos > 0 && (self.data[self.pos] & 0b11000000) == 0b10000000 {
202 self.pos -= 1;
203 }
204
205 Ok(())
206 }
207
208 fn pull_or_peek_char(&mut self, peek: bool) -> Result<char> {
209 let mut codepoint: u32 = 0;
210
211 let start = self.pos;
212 let b0 = self.pull_byte()? as u32;
213
214 if b0 & 0x80 != 0 {
215 if (b0 & 0b11100000) == 0b11000000 {
216 let b1 = self.pull_byte()? as u32;
218 codepoint = ((b0 & 0b00011111) << 6) | (b1 & 0b00111111);
219 } else if (b0 & 0xF0) == 0xE0 {
220 let b1 = self.pull_byte()? as u32;
222 let b2 = self.pull_byte()? as u32;
223 codepoint = ((b0 & 0x0F) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F);
224 } else if (b0 & 0xF8) == 0xF0 {
225 let b1 = self.pull_byte()? as u32;
227 let b2 = self.pull_byte()? as u32;
228 let b3 = self.pull_byte()? as u32;
229 codepoint =
230 ((b0 & 0x07) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
231 }
232 } else {
233 return Ok((b0 as u8) as char);
235 }
236
237 if peek {
238 self.pos = start;
239 }
240
241 match std::char::from_u32(codepoint) {
242 Some(c) => Ok(c),
243 None => Err(Error::BadUtf8Encoding),
244 }
245 }
246
247 pub fn parse_until_char(&mut self, needle: char) -> Result<()> {
248 self.parse_until(|c| c == needle)
249 }
250
251 pub fn parse_until<F: Fn(char) -> bool>(&mut self, matches: F) -> Result<()> {
252 let len = self.len();
253 while self.pos < len {
254 let byte = self.data[self.pos];
255 let prev = self.pos;
256
257 let chr = if is_utf8(byte) {
258 self.pull_char()?
259 } else {
260 self.pos += 1;
261 byte as char
262 };
263
264 if matches(chr) {
265 self.pos = prev;
266 return Ok(());
267 }
268 }
269
270 Err(Error::OutOfBounds(Bound::End))
271 }
272}
273
274fn parser_codepoint_char(codepoint: u32) -> Result<char> {
275 match std::char::from_u32(codepoint) {
276 Some(c) => Ok(c),
277 None => Err(Error::BadUtf8Encoding),
278 }
279}
280
#[cfg(test)]
mod test {
    use super::*;

    /// `parse_until_char` stops on the needle and leaves it unconsumed.
    #[test]
    fn test_parser() -> Result<()> {
        let mut cursor = Parser::from_str(" #hashtag ");

        assert_eq!(cursor.parse_until_char('#'), Ok(()));
        assert_eq!(cursor.pos, 1);

        assert_eq!(cursor.parse_until_char('t'), Ok(()));
        assert_eq!(cursor.pos, 6);

        Ok(())
    }

    /// A bracketed number: the opening bracket is consumed, then the digits,
    /// leaving the cursor on the closing bracket.
    #[test]
    fn test_parse_digits() {
        let mut cursor = Parser::from_str("[1315]");

        let bracket = cursor.parse_char('[');
        assert_eq!(bracket, Ok(()));

        let number = cursor.parse_digits();
        assert_eq!(number, Ok(1315));
        assert_eq!(cursor.pos(), 5);
    }

    /// After pulling a four-byte character, looking backwards yields that
    /// same character and does not move the cursor.
    #[test]
    fn test_peek_prev_char() {
        let mut cursor = Parser::from_str(".👽.");

        let found = cursor.parse_until_char('👽');
        assert_eq!(found, Ok(()));

        let pulled = cursor.pull_char();
        assert_eq!(pulled, Ok('👽'));

        let previous = cursor.peek_prev_char();
        assert_eq!(previous, Ok('👽'));
        assert_eq!(cursor.pos(), 5);
    }

    /// Multi-byte characters are handled both as needles and as characters
    /// to step over.
    #[test]
    fn test_utf8_parsing() -> Result<()> {
        let mut cursor = Parser::from_str("hey there #👽.");

        let _ = cursor.parse_until_char('👽');
        assert_eq!(cursor.peek_char(), Ok('👽'));
        assert_eq!(cursor.pos, 11);

        let stopped =
            cursor.parse_until(|c| c.is_ascii_whitespace() || c.is_ascii_punctuation());
        assert_eq!(stopped, Ok(()));
        assert_eq!(cursor.peek_char(), Ok('.'));

        Ok(())
    }
}
337
/// Returns `true` when the high bit of `byte` is set, i.e. the byte is not
/// plain ASCII and must belong to a multi-byte UTF-8 sequence.
#[inline(always)]
fn is_utf8(byte: u8) -> bool {
    byte >= 0x80
}