atm_parser_helper_common_syntax/
lib.rs

1use atm_parser_helper::{Eoi, Error, ParserHelper};
2
3#[cfg(feature = "arbitrary")]
4pub mod testing;
5
6/// A trait for error types that can represent errors occurring when parsing whitespace.
7pub trait WhiteSpaceE : Eoi {
8    /// Create a value indicating that a comment contains non-UTF-8 content.
9    fn utf8_comment() -> Self;
10}
11
12/// Parse an arbitrary amount of whitespace.
13pub fn spaces<E: WhiteSpaceE>(p: &mut ParserHelper) -> Result<(), Error<E>> {
14    loop {
15        match p.peek_or_end() {
16            Some(0x09) | Some(0x0a) | Some(0x0d) | Some(0x20) => p.advance(1),
17            Some(0x23) => comment(p)?,
18            Some(_) | None => return Ok(()),
19        }
20    }
21}
22
23fn comment<E: WhiteSpaceE>(p: &mut ParserHelper) -> Result<(), Error<E>> {
24    let start = p.position();
25    p.advance(1); // #
26    loop {
27        match p.next_or_end() {
28            Some(0x0a) | None => {
29                match std::str::from_utf8(p.slice(start..p.position())) {
30                    Ok(_) => return Ok(()),
31                    Err(_) => return p.fail_at_position(E::utf8_comment(), start),
32                }
33            }
34            Some(_) => {}
35        }
36    }
37}
38
39/// A trait for error types that can represent errors occurring when parsing an integer literal.
40pub trait IntLiteralE : Eoi {
41    /// Create a value indicating that an integer literal contains no digits.
42    fn int_no_digits() -> Self;
43    /// Create a value indicating that the input does not contain (even the beginning of) an integer literal.
44    fn not_int_literal() -> Self;
45}
46
47/// Parse an integer literal.
48pub fn parse_int<I, E: IntLiteralE>(
49    p: &mut ParserHelper,
50    from_decimal: fn(&str) -> Result<I, E>,
51    from_hex: fn(&str) -> Result<I, E>,
52    from_binary: fn(&str) -> Result<I, E>,
53) -> Result<I, Error<E>> {
54    let start = p.position();
55
56    let negative = p.advance_over(b"-");
57    let has_sign = negative || p.advance_over(b"+");
58
59    let is_hex = !has_sign && p.advance_over(b"0x");
60    let is_binary = !is_hex && (!has_sign && p.advance_over(b"0b"));
61
62    if is_hex {
63        if !is_hex_digit(p.peek()?) {
64            return p.fail(E::int_no_digits());
65        }
66
67        let start = p.position();
68        p.skip(is_hex_digit_or_underscore);
69
70        let digits_with_underscores = unsafe { std::str::from_utf8_unchecked(p.slice(start..p.position())) };
71        let without_underscores = digits_with_underscores.replace("_", "");
72        match from_hex(&without_underscores) {
73            Ok(n) => return Ok(n),
74            Err(e) => return p.fail(e),
75        }
76    } else if is_binary {
77        if !is_binary_digit(p.peek()?) {
78            return p.fail(E::int_no_digits());
79        }
80
81        let start = p.position();
82        p.skip(is_binary_digit_or_underscore);
83
84        let digits_with_underscores = unsafe { std::str::from_utf8_unchecked(p.slice(start..p.position())) };
85        let without_underscores = digits_with_underscores.replace("_", "");
86        match from_binary(&without_underscores) {
87            Ok(n) => return Ok(n),
88            Err(e) => return p.fail(e),
89        }
90    } else {
91        if !is_digit(p.peek()?) {
92            if has_sign {
93                return p.fail(E::int_no_digits());
94            } else {
95                return p.fail(E::not_int_literal());
96            }
97        }
98
99        p.skip(is_digit_or_underscore);
100
101        let digits_with_underscores = unsafe { std::str::from_utf8_unchecked(p.slice(start..p.position())) };
102        let without_underscores = digits_with_underscores.replace("_", "");
103        match from_decimal(&without_underscores) {
104            Ok(n) => return Ok(n),
105            Err(e) => return p.fail(e),
106        }
107    }
108}
109
110/// A trait for error types that can represent errors occurring when parsing an integer literal.
111pub trait FloatLiteralE : Eoi {
112    /// Create a value indicating that a float literal contains no digits before the point.
113    fn float_no_leading_digits() -> Self;
114    /// Create a value indicating that a float literal contains no point.
115    fn float_no_point() -> Self;
116    /// Create a value indicating that a float literal contains no digits after the point.
117    fn float_no_trailing_digits() -> Self;
118    /// Create a value indicating that a float literal contains no digits as the exponent.
119    fn float_no_exponent_digits() -> Self;
120    /// Create a value indicating that the input does not contain (even the beginning of) a float literal.
121    fn not_float_literal() -> Self;
122}
123
124/// Parse a floating-point number literal.
125pub fn parse_float<F, E: FloatLiteralE>(
126    p: &mut ParserHelper,
127    from_s: fn(&str) -> Result<F, E>,
128    neg_inf: F,
129    pos_inf: F,
130    nan: F,
131) -> Result<F, Error<E>> {
132    let start = p.position();
133
134    let negative = p.advance_over(b"-");
135    let has_sign = negative || p.advance_over(b"+");
136
137    match p.peek()? {
138        0x49 => {
139            p.expect_bytes(b"Inf", E::not_float_literal())?;
140            return Ok(if negative { neg_inf } else { pos_inf });
141        }
142        0x4e => {
143            p.expect_bytes(b"NaN", E::not_float_literal())?;
144            return Ok(nan);
145        }
146        _ => {}
147    }
148
149    if !is_digit(p.peek()?) {
150        if has_sign {
151            return p.fail(E::float_no_leading_digits());
152        } else {
153            return p.fail(E::not_float_literal());
154        }
155    }
156    p.skip(is_digit_or_underscore);
157
158    p.expect('.' as u8, E::float_no_point())?;
159
160    if !is_digit(p.peek()?) {
161        return p.fail(E::float_no_trailing_digits());
162    }
163    p.skip(is_digit_or_underscore);
164
165    if let Ok(0x45 | 0x65) = p.peek::<E>() {
166        p.advance(1);
167        let negative = p.advance_over(b"-");
168        if !negative {
169            p.advance_over(b"+");
170        }
171
172        if !is_digit(p.peek()?) {
173            return p.fail(E::float_no_exponent_digits());
174        }
175        p.skip(is_digit_or_underscore);
176    }
177
178    let digits_with_underscores = unsafe { std::str::from_utf8_unchecked(p.slice(start..p.position())) };
179    let without_underscores = digits_with_underscores.replace("_", "");
180    match from_s(&without_underscores) {
181        Ok(n) => return Ok(n),
182        Err(_) => panic!("Prior parsing should have ensured a valid input to f64::from_str"),
183    }
184}
185
/// A parsed number literal: either a floating-point value of type `F` or an integer of type `I`.
///
/// The variant order matters for the derived `PartialOrd`: every `Float` compares less than
/// every `Integer`.
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd)]
pub enum Number<I, F> {
    /// The literal was a float (it had a decimal point, or was `Inf`/`NaN`).
    Float(F),
    /// The literal was an integer (decimal, hexadecimal or binary).
    Integer(I),
}
191
192/// Parse number literal.
193pub fn parse_number<I, F, E: FloatLiteralE + IntLiteralE>(
194    p: &mut ParserHelper,
195    from_decimal: fn(&str) -> Result<I, E>,
196    from_hex: fn(&str) -> Result<I, E>,
197    from_binary: fn(&str) -> Result<I, E>,
198    from_s: fn(&str) -> Result<F, E>,
199    neg_inf: F,
200    pos_inf: F,
201    nan: F,
202) -> Result<Number<I, F>, Error<E>> {
203    let start = p.position();
204
205    let negative = p.advance_over(b"-");
206    let has_sign = negative || p.advance_over(b"+");
207
208    match p.peek()? {
209        0x49 => {
210            p.expect_bytes(b"Inf", E::not_float_literal())?;
211            return Ok(if negative { Number::Float(neg_inf) } else { Number::Float(pos_inf) });
212        }
213        0x4e => {
214            p.expect_bytes(b"NaN", E::not_float_literal())?;
215            return Ok(Number::Float(nan));
216        }
217        _ => {}
218    }
219
220    let is_hex = !has_sign && p.advance_over(b"0x");
221    let is_binary = !is_hex && (!has_sign && p.advance_over(b"0b"));
222
223    if is_hex {
224        if !is_hex_digit(p.peek()?) {
225            return p.fail(E::int_no_digits());
226        }
227
228        let start = p.position();
229        p.skip(is_hex_digit_or_underscore);
230
231        let digits_with_underscores = unsafe { std::str::from_utf8_unchecked(p.slice(start..p.position())) };
232        let without_underscores = digits_with_underscores.replace("_", "");
233        match from_hex(&without_underscores) {
234            Ok(n) => return Ok(Number::Integer(n)),
235            Err(e) => return p.fail(e),
236        }
237    } else if is_binary {
238        if !is_binary_digit(p.peek()?) {
239            return p.fail(E::int_no_digits());
240        }
241
242        let start = p.position();
243        p.skip(is_binary_digit_or_underscore);
244
245        let digits_with_underscores = unsafe { std::str::from_utf8_unchecked(p.slice(start..p.position())) };
246        let without_underscores = digits_with_underscores.replace("_", "");
247        match from_binary(&without_underscores) {
248            Ok(n) => return Ok(Number::Integer(n)),
249            Err(e) => return p.fail(e),
250        }
251    } else {
252        if !is_digit(p.peek()?) {
253            if has_sign {
254                return p.fail(E::int_no_digits());
255            } else {
256                return p.fail(E::not_int_literal());
257            }
258        }
259
260        p.skip(is_digit_or_underscore);
261
262        match p.peek::<E>() {
263            Ok(0x2e) => {
264                p.advance(1);
265                if !is_digit(p.peek()?) {
266                    return p.fail(E::float_no_trailing_digits());
267                }
268                p.skip(is_digit_or_underscore);
269
270                if let Ok(0x45 | 0x65) = p.peek::<E>() {
271                    p.advance(1);
272                    let negative = p.advance_over(b"-");
273                    if !negative {
274                        p.advance_over(b"+");
275                    }
276
277                    if !is_digit(p.peek()?) {
278                        return p.fail(E::float_no_exponent_digits());
279                    }
280                    p.skip(is_digit_or_underscore);
281                }
282
283                let digits_with_underscores = unsafe { std::str::from_utf8_unchecked(p.slice(start..p.position())) };
284                let without_underscores = digits_with_underscores.replace("_", "");
285                match from_s(&without_underscores) {
286                    Ok(n) => return Ok(Number::Float(n)),
287                    Err(_) => panic!("Prior parsing should have ensured a valid input to f64::from_str"),
288                }
289            }
290
291            _ => {
292                let digits_with_underscores = unsafe { std::str::from_utf8_unchecked(p.slice(start..p.position())) };
293                let without_underscores = digits_with_underscores.replace("_", "");
294                match from_decimal(&without_underscores) {
295                    Ok(n) => return Ok(Number::Integer(n)),
296                    Err(e) => return p.fail(e),
297                }
298            }
299        }
300    }
301}
302
303/// A trait for error types that can represent errors occurring when parsing a byte string literal.
304pub trait ByteStringLiteralE : Eoi + WhiteSpaceE + IntLiteralE {
305    /// Create a value indicating that a hexdecimal byte string literal contains an odd number of digits.
306    fn odd_hex_digits() -> Self;
307    /// Create a value indicating that a binary byte string literal contains a number of digits not divisible by eight.
308    fn number_binary_digits() -> Self;
309    /// Create a value indicating that the next input byte should be a ','.
310    fn expected_comma() -> Self;
311    /// Create a value indicating that a integer byte string literal contains a number that is not a u8.
312    fn byte_out_of_bounds() -> Self;
313    /// Create a value indicating that the input does not contain (even the beginning of) a byte string literal.
314    fn not_byte_string_literal() -> Self;
315}
316
317/// Parse a byte string literal.
318pub fn parse_byte_string<E: ByteStringLiteralE>(p: &mut ParserHelper) -> Result<Vec<u8>, Error<E>> {
319    p.expect('@' as u8, E::not_byte_string_literal())?;
320    match p.next()? {
321        0x5b => {
322            let mut r = Vec::new();
323            loop {
324                spaces(p)?;
325                if p.peek()? == (']' as u8) {
326                    p.advance(1);
327                    return Ok(r);
328                }
329
330                let b = parse_int(p, u8_from_decimal, u8_from_hex, u8_from_binary)?;
331                r.push(b);
332
333                spaces(p)?;
334
335                if p.peek()? == (']' as u8) {
336                    p.advance(1);
337                    return Ok(r);
338                } else if p.peek()? == (',' as u8) {
339                    p.advance(1);
340                } else {
341                    return p.fail(E::expected_comma());
342                }
343            }
344        }
345        0x78 => {
346            let start = p.position();
347            p.skip(is_hex_digit_or_underscore);
348
349            let digits_with_underscores = unsafe { std::str::from_utf8_unchecked(p.slice(start..p.position())) };
350            let without_underscores = digits_with_underscores.replace("_", "");
351
352            if without_underscores.len() % 2 == 0 {
353                let mut buf = Vec::new();
354                let mut i = 0;
355                while i < without_underscores.len() {
356                    buf.push(u8::from_str_radix(unsafe {std::str::from_utf8_unchecked(&without_underscores.as_bytes()[i..i + 2])}, 16).unwrap());
357                    i += 2;
358                }
359                return Ok(buf);
360            } else {
361                p.fail(E::odd_hex_digits())
362            }
363        }
364        0x62 => {
365            let start = p.position();
366            p.skip(is_binary_digit_or_underscore);
367
368            let digits_with_underscores = unsafe { std::str::from_utf8_unchecked(p.slice(start..p.position())) };
369            let without_underscores = digits_with_underscores.replace("_", "");
370
371            if without_underscores.len() % 8 == 0 {
372                let mut buf = Vec::new();
373                let mut i = 0;
374                while i < without_underscores.len() {
375                    buf.push(u8::from_str_radix(unsafe {std::str::from_utf8_unchecked(&without_underscores.as_bytes()[i..i + 8])}, 2).unwrap());
376                    i += 8;
377                }
378                return Ok(buf);
379            } else {
380                p.fail(E::number_binary_digits())
381            }
382        }
383        _ => p.fail(E::not_byte_string_literal()),
384    }
385}
386
387/// A trait for error types that can represent errors occurring when parsing a UTF-8 string literal.
388pub trait Utf8StringLiteralE : Eoi {
389    /// Create a value indicating that a raw UTF-8 string literal contains non-utf8 bytes.
390    fn raw_not_utf8() -> Self;
391    /// Create a value indicating that a raw UTF-8 string literal starts with more than 255 `@`.
392    fn raw_too_many_ats() -> Self;
393    /// Create a value indicating that an escaping UTF-8 string literal contains non-utf8 bytes.
394    fn escaping_not_utf8() -> Self;
395    /// Create a value indicating that an escaping UTF-8 string literal contains a `\` followed by an invalid character.
396    fn invalid_escape_sequence() -> Self;
397    /// Create a value indicating that a unicode escape sequence contains an invalid number of digits.
398    fn unicode_escape_number_digits() -> Self;
399    /// Create a value indicating that a unicode escape sequence encodes a number that is not a unicode scalar.
400    fn unicode_escape_invalid_scalar() -> Self;
401    /// Create a value indicating that a unicode escape sequence is not terminated by a `}`.
402    fn unicode_escape_no_closing() -> Self;
403    /// Create a value indicating that the input does not contain (even the beginning of) a UTF-8 string literal.
404    fn not_utf8_string_literal() -> Self;
405}
406
407/// Parse a UTF-8 string literal.
408pub fn parse_utf8_string<E: Utf8StringLiteralE>(p: &mut ParserHelper) -> Result<String, Error<E>> {
409    let start_ats = p.position();
410    p.skip(is_at);
411    let ats = p.position() - start_ats;
412
413    p.expect('"' as u8, E::not_utf8_string_literal())?;
414    let start = p.position();
415
416    if ats == 0 {
417        let mut s = String::new();
418
419        loop {
420            if p.advance_over(b"\"") {
421                return Ok(s);
422            } else {
423                s.push(parse_char(p)?);
424            }
425        }
426    } else {
427        let mut consecutive_ats = None;
428        let mut end = 0;
429        loop {
430            let b = p.next()?;
431            match b {
432                0x22 => {
433                    consecutive_ats = Some(0);
434                    end = p.position() - 1;
435                }
436                0x40 => {
437                    match consecutive_ats.as_mut() {
438                        None => {}
439                        Some(n) => {
440                            *n += 1;
441                            if *n > 255 {
442                                return p.fail(E::raw_too_many_ats());
443                            }
444                            if *n == ats {
445                                return std::str::from_utf8(p.slice(start..end))
446                                    .map(|s| s.to_string())
447                                    .map_err(|_| p.fail::<(), E>(E::raw_not_utf8()).unwrap_err());
448                            }
449                        }
450                    }
451                }
452                _ => consecutive_ats = None,
453            }
454        }
455    }
456}
457
458fn parse_char<E: Utf8StringLiteralE>(p: &mut ParserHelper) -> Result<char, Error<E>> {
459    let start = p.position();
460    let fst = p.next()?;
461    let mut scalar;
462    if (fst & 0b1000_0000) == 0b0000_0000 {
463        scalar = fst as u32;
464    } else if (fst & 0b1110_0000) == 0b1100_0000 {
465        scalar = (fst & 0b0001_1111) as u32;
466        scalar <<= 6;
467        scalar = ((p.next()? & 0b0011_1111) as u32) | scalar;
468    } else if (fst & 0b1111_0000) == 0b1110_0000 {
469        scalar = (fst & 0b0000_1111) as u32;
470        scalar <<= 6;
471        scalar = ((p.next()? & 0b0011_1111) as u32) | scalar;
472        scalar <<= 6;
473        scalar = ((p.next()? & 0b0011_1111) as u32) | scalar;
474    } else if (fst & 0b1111_1000) == 0b1111_0000 {
475        scalar = (fst & 0b0000_0111) as u32;
476        scalar <<= 6;
477        scalar = ((p.next()? & 0b0011_1111) as u32) | scalar;
478        scalar <<= 6;
479        scalar = ((p.next()? & 0b0011_1111) as u32) | scalar;
480        scalar <<= 6;
481        scalar = ((p.next()? & 0b0011_1111) as u32) | scalar;
482    } else {
483        return p.fail(E::escaping_not_utf8())?;
484    }
485
486    if let Err(_) = std::str::from_utf8(p.slice(start..p.position())) {
487        return p.fail(E::escaping_not_utf8()); // catch overlong encodings etc
488    }
489
490    match core::char::from_u32(scalar) {
491        None => return p.fail(E::escaping_not_utf8()),
492        Some(c) => {
493            if c == '\\' {
494                match p.next()? {
495                    0x22 => return Ok('\"'),
496                    0x30 => return Ok('\0'),
497                    0x5c => return Ok('\\'),
498                    0x6e => return Ok('\n'),
499                    0x74 => return Ok('\t'),
500                    0x7b => {
501                        let start = p.position();
502                        p.skip(is_hex_digit);
503                        let end = p.position();
504                        let len = end - start;
505
506                        if len < 1 || len > 6 {
507                            return p.fail(E::unicode_escape_number_digits());
508                        }
509
510                        let raw = p.slice(start..end);
511                        let numeric = u32::from_str_radix(unsafe { std::str::from_utf8_unchecked(raw) }, 16).unwrap();
512
513                        match std::char::from_u32(numeric) {
514                            None => return p.fail(E::unicode_escape_invalid_scalar()),
515                            Some(c) => {
516                                p.expect('}' as u8, E::unicode_escape_no_closing())?;
517                                return Ok(c);
518                            }
519                        }
520                    }
521                    _ => return p.fail(E::invalid_escape_sequence()),
522                }
523            } else {
524                return Ok(c);
525            }
526        }
527    }
528}
529
/// Returns `true` exactly when `b` is the ASCII `@` byte.
fn is_at(b: u8) -> bool {
    matches!(b, b'@')
}
533
/// Returns `true` exactly when `byte` is an ASCII decimal digit (`0` to `9`).
fn is_digit(byte: u8) -> bool {
    matches!(byte, b'0'..=b'9')
}
537
/// Returns `true` exactly when `byte` is an ASCII hexadecimal digit, in either case.
fn is_hex_digit(byte: u8) -> bool {
    matches!(byte, b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F')
}
541
/// Returns `true` exactly when `byte` is the ASCII digit `0` or `1`.
fn is_binary_digit(byte: u8) -> bool {
    matches!(byte, b'0' | b'1')
}
545
/// Returns `true` exactly when `byte` is an ASCII decimal digit or an underscore.
fn is_digit_or_underscore(byte: u8) -> bool {
    matches!(byte, b'_' | b'0'..=b'9')
}
549
/// Returns `true` exactly when `byte` is an ASCII hexadecimal digit or an underscore.
fn is_hex_digit_or_underscore(byte: u8) -> bool {
    byte.is_ascii_hexdigit() || byte == b'_'
}
553
/// Returns `true` exactly when `byte` is the ASCII digit `0` or `1`, or an underscore.
fn is_binary_digit_or_underscore(byte: u8) -> bool {
    matches!(byte, b'_' | b'0' | b'1')
}
557
558pub fn u8_from_decimal<E: ByteStringLiteralE>(s: &str) -> Result<u8, E> {
559    u8::from_str_radix(s, 10).map_err(|_| E::byte_out_of_bounds())
560}
561
562pub fn u8_from_hex<E: ByteStringLiteralE>(s: &str) -> Result<u8, E> {
563    u8::from_str_radix(s, 16).map_err(|_| E::byte_out_of_bounds())
564}
565
566pub fn u8_from_binary<E: ByteStringLiteralE>(s: &str) -> Result<u8, E> {
567    u8::from_str_radix(s, 2).map_err(|_| E::byte_out_of_bounds())
568}