shatter/
parser.rs

/// A cursor over a borrowed byte slice.
///
/// The parser never copies the input: it keeps a reference to the bytes and a
/// byte offset (`pos`) that the parsing methods move forwards or backwards.
#[derive(Debug, PartialEq, Eq)]
pub struct Parser<'a> {
    /// The bytes being parsed.
    data: &'a [u8],
    /// Byte offset of the cursor into `data`.
    pos: usize,
}
6
/// Identifies which end of the input an out-of-bounds access ran into.
#[derive(Debug, PartialEq, Eq)]
pub enum Bound {
    /// The cursor was already at offset 0 and a backwards move/read was requested.
    Start,
    /// A read was requested at or beyond the end of the input.
    End,
}
12
13#[derive(Debug, PartialEq, Eq)]
14pub enum Error {
15    NotFound,
16    BadUtf8Encoding,
17    OutOfBounds(Bound),
18}
19
/// Result type used by the parser, with [`Error`] as the error type.
pub type Result<T> = std::result::Result<T, Error>;
21
22pub fn is_oob<T>(r: Result<T>) -> bool {
23    match r {
24        Err(Error::OutOfBounds(_)) => true,
25        Err(_) => false,
26        Ok(_) => false,
27    }
28}
29
30impl<'a> Parser<'a> {
31    pub fn from_bytes(data: &'a [u8]) -> Parser<'a> {
32        Parser { data: data, pos: 0 }
33    }
34
35    pub fn from_str(string: &'a str) -> Parser<'a> {
36        Parser {
37            data: string.as_bytes(),
38            pos: 0,
39        }
40    }
41
42    #[inline(always)]
43    pub fn set_pos(&mut self, pos: usize) {
44        self.pos = pos;
45    }
46
47    #[inline(always)]
48    pub fn pos(&self) -> usize {
49        self.pos
50    }
51
52    #[inline(always)]
53    pub fn data(&self) -> &[u8] {
54        self.data
55    }
56
57    #[inline(always)]
58    pub fn len(&self) -> usize {
59        self.data.len()
60    }
61
62    pub fn pull_byte(&mut self) -> Result<u8> {
63        if self.pos + 1 > self.len() {
64            return Err(Error::OutOfBounds(Bound::End));
65        }
66
67        let c = self.data[self.pos];
68        self.pos += 1;
69        return Ok(c);
70    }
71
72    pub fn skip<F: Fn(char) -> bool>(&mut self, should_skip: F) -> Result<()> {
73        let len = self.len();
74        while self.pos < len {
75            let prev = self.pos();
76            if should_skip(self.pull_char()?) {
77                continue;
78            } else {
79                self.set_pos(prev);
80                return Ok(());
81            }
82        }
83
84        return Err(Error::OutOfBounds(Bound::End));
85    }
86
87    pub fn skip_whitespace(&mut self) -> Result<()> {
88        self.skip(|c| c.is_ascii_whitespace())
89    }
90
91    pub fn parse_digits(&mut self) -> Result<u16> {
92        let mut i = self.pos();
93        let mut digits = 0u16;
94        let mut number = 0u16;
95
96        while i < self.len() {
97            let byte = self.data()[i];
98
99            // if it's a utf8 char this is not a digit
100            if (byte & 0x80) == 0x80 || !(byte as char).is_ascii_digit() || digits == 5 {
101                break;
102            }
103
104            let digit = (byte - b'0') as u16;
105
106            number = number.saturating_mul(10).saturating_add(digit);
107
108            digits += 1;
109            i += 1;
110        }
111
112        if digits == 0 || digits > 5 {
113            return Err(Error::NotFound);
114        }
115
116        self.set_pos(i);
117        Ok(number)
118    }
119
120    /// Parser a specific character. If not found, do not advance the parser.
121    pub fn parse_char(&mut self, matching: char) -> Result<()> {
122        let start = self.pos();
123        let c = self.pull_char()?;
124        if c == matching {
125            return Ok(());
126        }
127        self.set_pos(start);
128        return Err(Error::NotFound);
129    }
130
131    pub fn peek_char(&mut self) -> Result<char> {
132        let peek = true;
133        self.pull_or_peek_char(peek)
134    }
135
136    pub fn pull_char(&mut self) -> Result<char> {
137        let peek = false;
138        self.pull_or_peek_char(peek)
139    }
140
141    pub fn peek_prev_byte(&mut self) -> Result<u8> {
142        if self.pos == 0 {
143            return Err(Error::OutOfBounds(Bound::Start));
144        }
145
146        Ok(self.data[self.pos - 1])
147    }
148
149    pub fn seek_prev_byte(&mut self) -> Result<()> {
150        if self.pos == 0 {
151            return Err(Error::OutOfBounds(Bound::Start));
152        }
153        self.pos -= 1;
154
155        Ok(())
156    }
157
158    pub fn peek_prev_char(&self) -> Result<char> {
159        let mut i = 1;
160        let codepoint: u32;
161        let mut bs: [u32; 4] = [0; 4];
162
163        if self.pos == 0 {
164            return Err(Error::OutOfBounds(Bound::Start));
165        }
166
167        while i <= 4 && ((self.pos as i32) - (i as i32) >= 0) {
168            let byte = self.data[self.pos - i] as u32;
169            let masked = byte & 0b11000000;
170            if masked == 0b10000000 {
171                // continuation byte
172                bs[i - 1] = byte & 0b00111111;
173                i += 1;
174            } else if masked == 0b11000000 {
175                // start byte
176                match i {
177                    4 => {
178                        codepoint = ((bs[3] & 0x07) << 18)
179                            | ((bs[2] & 0x3F) << 12)
180                            | ((bs[1] & 0x3F) << 6)
181                            | (bs[0] & 0x3F)
182                    }
183                    3 => {
184                        codepoint = ((bs[2] & 0x0F) << 12) | ((bs[1] & 0x3F) << 6) | (bs[0] & 0x3F)
185                    }
186                    2 => codepoint = ((bs[1] & 0x0F) << 6) | (bs[0] & 0x3F),
187                    _ => return Err(Error::BadUtf8Encoding),
188                }
189                return parser_codepoint_char(codepoint);
190            } else {
191                return parser_codepoint_char(byte);
192            }
193        }
194
195        // If we reached here, we reached the start of the string without finding a non-continuation byte.
196        Err(Error::BadUtf8Encoding)
197    }
198
199    pub fn seek_prev_char(&mut self) -> Result<()> {
200        self.seek_prev_byte()?;
201        while self.pos > 0 && (self.data[self.pos] & 0b11000000) == 0b10000000 {
202            self.pos -= 1;
203        }
204
205        Ok(())
206    }
207
208    fn pull_or_peek_char(&mut self, peek: bool) -> Result<char> {
209        let mut codepoint: u32 = 0;
210
211        let start = self.pos;
212        let b0 = self.pull_byte()? as u32;
213
214        if b0 & 0x80 != 0 {
215            if (b0 & 0b11100000) == 0b11000000 {
216                // Two-byte sequence
217                let b1 = self.pull_byte()? as u32;
218                codepoint = ((b0 & 0b00011111) << 6) | (b1 & 0b00111111);
219            } else if (b0 & 0xF0) == 0xE0 {
220                // Three-byte sequence
221                let b1 = self.pull_byte()? as u32;
222                let b2 = self.pull_byte()? as u32;
223                codepoint = ((b0 & 0x0F) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F);
224            } else if (b0 & 0xF8) == 0xF0 {
225                // Four-byte sequence
226                let b1 = self.pull_byte()? as u32;
227                let b2 = self.pull_byte()? as u32;
228                let b3 = self.pull_byte()? as u32;
229                codepoint =
230                    ((b0 & 0x07) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
231            }
232        } else {
233            // Single-byte ASCII character
234            return Ok((b0 as u8) as char);
235        }
236
237        if peek {
238            self.pos = start;
239        }
240
241        match std::char::from_u32(codepoint) {
242            Some(c) => Ok(c),
243            None => Err(Error::BadUtf8Encoding),
244        }
245    }
246
247    pub fn parse_until_char(&mut self, needle: char) -> Result<()> {
248        self.parse_until(|c| c == needle)
249    }
250
251    pub fn parse_until<F: Fn(char) -> bool>(&mut self, matches: F) -> Result<()> {
252        let len = self.len();
253        while self.pos < len {
254            let byte = self.data[self.pos];
255            let prev = self.pos;
256
257            let chr = if is_utf8(byte) {
258                self.pull_char()?
259            } else {
260                self.pos += 1;
261                byte as char
262            };
263
264            if matches(chr) {
265                self.pos = prev;
266                return Ok(());
267            }
268        }
269
270        Err(Error::OutOfBounds(Bound::End))
271    }
272}
273
274fn parser_codepoint_char(codepoint: u32) -> Result<char> {
275    match std::char::from_u32(codepoint) {
276        Some(c) => Ok(c),
277        None => Err(Error::BadUtf8Encoding),
278    }
279}
280
#[cfg(test)]
mod test {
    use super::*;

    // Scanning forward to an ASCII needle leaves the cursor on the needle.
    #[test]
    fn test_parser() -> Result<()> {
        // The input is ASCII only, so byte offsets equal character offsets:
        // 00000000: 2023 6861 7368 7461 6720                 .#hashtag.
        let s = " #hashtag ";
        let mut parser = Parser::from_str(s);
        let mut res = parser.parse_until_char('#');
        assert_eq!(res, Ok(()));
        assert_eq!(parser.pos, 1);
        res = parser.parse_until_char('t');
        assert_eq!(res, Ok(()));
        assert_eq!(parser.pos, 6);
        Ok(())
    }

    // Digits are parsed up to the first non-digit, which is left unread.
    #[test]
    fn test_parse_digits() {
        let s = "[1315]";
        let mut parser = Parser::from_str(s);
        let r1 = parser.parse_char('[');
        assert_eq!(r1, Ok(()));
        let r2 = parser.parse_digits();
        assert_eq!(r2, Ok(1315));
        assert_eq!(parser.pos(), 5);
    }

    // After pulling a four-byte character, looking back yields that same
    // character and does not move the cursor.
    #[test]
    fn test_peek_prev_char() {
        let s = ".👽.";
        let mut parser = Parser::from_str(s);
        let r1 = parser.parse_until_char('👽');
        assert_eq!(r1, Ok(()));
        let r2 = parser.pull_char();
        assert_eq!(r2, Ok('👽'));
        let r3 = parser.peek_prev_char();
        assert_eq!(r3, Ok('👽'));
        assert_eq!(parser.pos(), 5);
    }

    // Scanning works across a multi-byte character: the needle is found at
    // its byte offset, and a later scan stops on the following punctuation.
    #[test]
    fn test_utf8_parsing() -> Result<()> {
        let s = "hey there #👽.";
        let mut parser = Parser::from_str(s);
        let _ = parser.parse_until_char('👽');
        assert_eq!(parser.peek_char(), Ok('👽'));
        assert_eq!(parser.pos, 11);
        let res = parser.parse_until(|c| c.is_ascii_whitespace() || c.is_ascii_punctuation());
        assert_eq!(res, Ok(()));
        assert_eq!(parser.peek_char(), Ok('.'));
        Ok(())
    }
}
337
/// Reports whether `byte` has its high bit set, i.e. it is part of a
/// multi-byte UTF-8 sequence rather than a plain ASCII character.
#[inline(always)]
fn is_utf8(byte: u8) -> bool {
    !byte.is_ascii()
}