gurkle_parser/
tokenizer.rs

1use std::fmt;
2
3use combine::easy::{Error, Errors};
4use combine::error::StreamError;
5use combine::stream::Resetable;
6use combine::{Positioned, StreamOnce};
7
8use crate::position::Pos;
9
10#[derive(Debug, PartialEq, Eq, Clone, Copy)]
11pub enum Kind {
12    Punctuator,
13    Name,
14    IntValue,
15    FloatValue,
16    StringValue,
17    BlockString,
18}
19
20#[derive(Debug, PartialEq, Eq, Clone, Copy)]
21pub struct Token<'a> {
22    pub kind: Kind,
23    pub value: &'a str,
24}
25
26#[derive(Debug, PartialEq)]
27pub struct TokenStream<'a> {
28    buf: &'a str,
29    position: Pos,
30    off: usize,
31    next_state: Option<(usize, Token<'a>, usize, Pos)>,
32    recursion_limit: usize,
33}
34
35impl TokenStream<'_> {
36    pub(crate) fn offset(&self) -> usize {
37        self.off
38    }
39}
40
41#[derive(Clone, Debug, PartialEq)]
42pub struct Checkpoint {
43    position: Pos,
44    off: usize,
45}
46
47impl<'a> StreamOnce for TokenStream<'a> {
48    type Item = Token<'a>;
49    type Range = Token<'a>;
50    type Position = Pos;
51    type Error = Errors<Token<'a>, Token<'a>, Pos>;
52
53    fn uncons(&mut self) -> Result<Self::Item, Error<Token<'a>, Token<'a>>> {
54        if let Some((at, tok, off, pos)) = self.next_state {
55            if at == self.off {
56                self.off = off;
57                self.position = pos;
58                return Ok(tok);
59            }
60        }
61        let old_pos = self.off;
62        let (kind, len) = self.take_token()?;
63        let value = &self.buf[self.off - len..self.off];
64        self.skip_whitespace();
65        let token = Token { kind, value };
66        self.next_state = Some((old_pos, token, self.off, self.position));
67        Ok(token)
68    }
69}
70
71impl<'a> Positioned for TokenStream<'a> {
72    fn position(&self) -> Self::Position {
73        self.position
74    }
75}
76
77impl<'a> Resetable for TokenStream<'a> {
78    type Checkpoint = Checkpoint;
79    fn checkpoint(&self) -> Self::Checkpoint {
80        Checkpoint {
81            position: self.position,
82            off: self.off,
83        }
84    }
85    fn reset(&mut self, checkpoint: Checkpoint) {
86        self.position = checkpoint.position;
87        self.off = checkpoint.off;
88    }
89}
90
91// NOTE: we expect that first character is always digit or minus, as returned
92// by tokenizer
93fn check_int(value: &str) -> bool {
94    value == "0"
95        || value == "-0"
96        || (!value.starts_with('0')
97            && value != "-"
98            && !value.starts_with("-0")
99            && value[1..].chars().all(|x| x >= '0' && x <= '9'))
100}
101
102fn check_dec(value: &str) -> bool {
103    !value.is_empty() && value.chars().all(|x| x >= '0' && x <= '9')
104}
105
106fn check_exp(value: &str) -> bool {
107    if value.is_empty() {
108        return false;
109    }
110    let first = value.chars().next().unwrap();
111    if first != '-' && first != '+' && (first <= '0' || first >= '9') {
112        return false;
113    }
114
115    value[1..].chars().all(|x| x >= '0' && x <= '9')
116}
117
118fn check_float(value: &str, exponent: Option<usize>, real: Option<usize>) -> bool {
119    match (exponent, real) {
120        (Some(e), Some(r)) if e < r => false,
121        (Some(e), Some(r)) => {
122            check_int(&value[..r]) && check_dec(&value[r + 1..e]) && check_exp(&value[e + 1..])
123        }
124        (Some(e), None) => check_int(&value[..e]) && check_exp(&value[e + 1..]),
125        (None, Some(r)) => check_int(&value[..r]) && check_dec(&value[r + 1..]),
126        (None, None) => unreachable!(),
127    }
128}
129
130impl<'a> TokenStream<'a> {
131    pub fn new(s: &str) -> TokenStream {
132        Self::with_recursion_limit(s, 50)
133    }
134
135    /// Specify a limit to recursive parsing. Note that increasing the limit
136    /// from the default may represent a security issue since a maliciously
137    /// crafted input may cause a stack overflow, crashing the process.
138    pub(crate) fn with_recursion_limit(s: &str, recursion_limit: usize) -> TokenStream {
139        let mut me = TokenStream {
140            buf: s,
141            position: Pos { line: 1, column: 1 },
142            off: 0,
143            next_state: None,
144            recursion_limit,
145        };
146        me.skip_whitespace();
147        me
148    }
149
150    /// Convenience for the common case where a token does
151    /// not span multiple lines. Infallible.
152    #[inline]
153    fn advance_token<T>(&mut self, kind: Kind, size: usize) -> Result<(Kind, usize), T> {
154        self.position.column += size;
155        self.off += size;
156        Ok((kind, size))
157    }
158
159    fn take_token(&mut self) -> Result<(Kind, usize), Error<Token<'a>, Token<'a>>> {
160        use self::Kind::*;
161        let mut iter = self.buf[self.off..].char_indices();
162        let cur_char = match iter.next() {
163            Some((_, x)) => x,
164            None => return Err(Error::end_of_input()),
165        };
166
167        match cur_char {
168            '(' | '[' | '{' => {
169                // Check for recursion limit
170                self.recursion_limit = self
171                    .recursion_limit
172                    .checked_sub(1)
173                    .ok_or_else(|| Error::message_static_message("Recursion limit exceeded"))?;
174
175                self.advance_token(Punctuator, 1)
176            }
177            ')' | ']' | '}' => {
178                // Notes on exceptional cases:
179                // recursion_limit may exceed the original value specified
180                // when constructing the Tokenizer. It may at first
181                // seem like this would be a good place to handle that,
182                // but instead this code allows this token to propagate up
183                // to the parser which is better equipped to make specific
184                // error messages about unmatched pairs.
185                // The case where recursion limit would overflow but instead
186                // saturates is just a specific case of the more general
187                // occurrence above.
188                self.recursion_limit = self.recursion_limit.saturating_add(1);
189                self.advance_token(Punctuator, 1)
190            }
191            '!' | '$' | ':' | '=' | '@' | '|' | '&' => self.advance_token(Punctuator, 1),
192            '.' => {
193                if iter.as_str().starts_with("..") {
194                    self.advance_token(Punctuator, 3)
195                } else {
196                    Err(Error::unexpected_message(format_args!(
197                        "bare dot {:?} is not supported, \
198                            only \"...\"",
199                        cur_char
200                    )))
201                }
202            }
203            '_' | 'a'..='z' | 'A'..='Z' => {
204                while let Some((idx, cur_char)) = iter.next() {
205                    match cur_char {
206                        '_' | 'a'..='z' | 'A'..='Z' | '0'..='9' => continue,
207                        _ => return self.advance_token(Name, idx),
208                    }
209                }
210                let len = self.buf.len() - self.off;
211                self.position.column += len;
212                self.off += len;
213
214                Ok((Name, len))
215            }
216            '-' | '0'..='9' => {
217                let mut exponent = None;
218                let mut real = None;
219                let len = loop {
220                    let (idx, cur_char) = match iter.next() {
221                        Some(pair) => pair,
222                        None => break self.buf.len() - self.off,
223                    };
224                    match cur_char {
225                        // just scan for now, will validate later on
226                        ' ' | '\n' | '\r' | '\t' | ',' | '#' | '!' | '$' | ':' | '=' | '@'
227                        | '|' | '&' | '(' | ')' | '[' | ']' | '{' | '}' => break idx,
228                        '.' => real = Some(idx),
229                        'e' | 'E' => exponent = Some(idx),
230                        _ => {}
231                    }
232                };
233
234                if exponent.is_some() || real.is_some() {
235                    let value = &self.buf[self.off..][..len];
236                    if !check_float(value, exponent, real) {
237                        return Err(Error::unexpected_message(format_args!(
238                            "unsupported float {:?}",
239                            value
240                        )));
241                    }
242                    self.position.column += len;
243                    self.off += len;
244
245                    Ok((FloatValue, len))
246                } else {
247                    let value = &self.buf[self.off..][..len];
248                    if !check_int(value) {
249                        return Err(Error::unexpected_message(format_args!(
250                            "unsupported integer {:?}",
251                            value
252                        )));
253                    }
254                    self.advance_token(IntValue, len)
255                }
256            }
257            '"' => {
258                if iter.as_str().starts_with("\"\"") {
259                    let tail = &iter.as_str()[2..];
260                    for (end_idx, _) in tail.match_indices("\"\"\"") {
261                        if !tail[..end_idx].ends_with('\\') {
262                            self.update_position(end_idx + 6);
263                            return Ok((BlockString, end_idx + 6));
264                        }
265                    }
266
267                    Err(Error::unexpected_message("unterminated block string value"))
268                } else {
269                    let mut nchars = 1;
270                    let mut escaped = false;
271                    for (idx, cur_char) in iter {
272                        nchars += 1;
273                        match cur_char {
274                            '"' if escaped => {}
275                            '"' => {
276                                self.position.column += nchars;
277                                self.off += idx + 1;
278                                return Ok((StringValue, idx + 1));
279                            }
280                            '\n' => {
281                                return Err(Error::unexpected_message("unterminated string value"));
282                            }
283
284                            _ => {}
285                        }
286
287                        // if we aren't escaped and the current char is a \, we are now escaped
288                        escaped = !escaped && cur_char == '\\';
289                    }
290                    Err(Error::unexpected_message("unterminated string value"))
291                }
292            }
293            _ => Err(Error::unexpected_message(format_args!(
294                "unexpected character {:?}",
295                cur_char
296            ))),
297        }
298    }
299
300    fn skip_whitespace(&mut self) {
301        let mut iter = self.buf[self.off..].char_indices();
302        let idx = loop {
303            let (idx, cur_char) = match iter.next() {
304                Some(pair) => pair,
305                None => break self.buf.len() - self.off,
306            };
307            match cur_char {
308                '\u{feff}' | '\r' => continue,
309                '\t' => self.position.column += 8,
310                '\n' => {
311                    self.position.column = 1;
312                    self.position.line += 1;
313                }
314                // comma is also entirely ignored in spec
315                ' ' | ',' => {
316                    self.position.column += 1;
317                    continue;
318                }
319                //comment
320                '#' => {
321                    while let Some((_, cur_char)) = iter.next() {
322                        // TODO(tailhook) ensure SourceCharacter
323                        if cur_char == '\r' || cur_char == '\n' {
324                            self.position.column = 1;
325                            self.position.line += 1;
326                            break;
327                        }
328                    }
329                    continue;
330                }
331                _ => break idx,
332            }
333        };
334        self.off += idx;
335    }
336
337    fn update_position(&mut self, len: usize) {
338        let val = &self.buf[self.off..][..len];
339        self.off += len;
340        let lines = val.as_bytes().iter().filter(|&&x| x == b'\n').count();
341        self.position.line += lines;
342        if lines > 0 {
343            let line_offset = val.rfind('\n').unwrap() + 1;
344            let num = val[line_offset..].chars().count();
345            self.position.column = num + 1;
346        } else {
347            let num = val.chars().count();
348            self.position.column += num;
349        }
350    }
351}
352
353impl<'a> fmt::Display for Token<'a> {
354    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
355        write!(f, "{}[{:?}]", self.value, self.kind)
356    }
357}
358
359#[cfg(test)]
360mod test {
361    use super::Kind::*;
362    use super::{Kind, TokenStream};
363    use combine::easy::Error;
364
365    use combine::{Positioned, StreamOnce};
366
367    fn tok_str(s: &str) -> Vec<&str> {
368        let mut r = Vec::new();
369        let mut s = TokenStream::new(s);
370        loop {
371            match s.uncons() {
372                Ok(x) => r.push(x.value),
373                Err(ref e) if e == &Error::end_of_input() => break,
374                Err(e) => panic!("Parse error at {}: {}", s.position(), e),
375            }
376        }
377        return r;
378    }
379    fn tok_typ(s: &str) -> Vec<Kind> {
380        let mut r = Vec::new();
381        let mut s = TokenStream::new(s);
382        loop {
383            match s.uncons() {
384                Ok(x) => r.push(x.kind),
385                Err(ref e) if e == &Error::end_of_input() => break,
386                Err(e) => panic!("Parse error at {}: {}", s.position(), e),
387            }
388        }
389        return r;
390    }
391
392    #[test]
393    fn comments_and_commas() {
394        assert_eq!(tok_str("# hello { world }"), &[] as &[&str]);
395        assert_eq!(tok_str("# x\n,,,"), &[] as &[&str]);
396        assert_eq!(tok_str(", ,,  ,,,  # x"), &[] as &[&str]);
397    }
398
399    #[test]
400    fn simple() {
401        assert_eq!(tok_str("a { b }"), ["a", "{", "b", "}"]);
402        assert_eq!(tok_typ("a { b }"), [Name, Punctuator, Name, Punctuator]);
403    }
404
405    #[test]
406    fn query() {
407        assert_eq!(
408            tok_str(
409                "query Query {
410            object { field }
411        }"
412            ),
413            ["query", "Query", "{", "object", "{", "field", "}", "}"]
414        );
415    }
416
417    #[test]
418    fn fragment() {
419        assert_eq!(tok_str("a { ...b }"), ["a", "{", "...", "b", "}"]);
420    }
421
422    #[test]
423    fn int() {
424        assert_eq!(tok_str("0"), ["0"]);
425        assert_eq!(tok_str("0,"), ["0"]);
426        assert_eq!(tok_str("0# x"), ["0"]);
427        assert_eq!(tok_typ("0"), [IntValue]);
428        assert_eq!(tok_str("-0"), ["-0"]);
429        assert_eq!(tok_typ("-0"), [IntValue]);
430        assert_eq!(tok_str("-1"), ["-1"]);
431        assert_eq!(tok_typ("-1"), [IntValue]);
432        assert_eq!(tok_str("-132"), ["-132"]);
433        assert_eq!(tok_typ("-132"), [IntValue]);
434        assert_eq!(tok_str("132"), ["132"]);
435        assert_eq!(tok_typ("132"), [IntValue]);
436        assert_eq!(
437            tok_str("a(x: 10) { b }"),
438            ["a", "(", "x", ":", "10", ")", "{", "b", "}"]
439        );
440        assert_eq!(
441            tok_typ("a(x: 10) { b }"),
442            [
443                Name, Punctuator, Name, Punctuator, IntValue, Punctuator, Punctuator, Name,
444                Punctuator
445            ]
446        );
447    }
448
449    // TODO(tailhook) fix errors in parser and check error message
450    #[test]
451    #[should_panic]
452    fn zero_int() {
453        tok_str("01");
454    }
455    #[test]
456    #[should_panic]
457    fn zero_int4() {
458        tok_str("00001");
459    }
460    #[test]
461    #[should_panic]
462    fn minus_int() {
463        tok_str("-");
464    }
465    #[test]
466    #[should_panic]
467    fn minus_zero_int() {
468        tok_str("-01");
469    }
470    #[test]
471    #[should_panic]
472    fn minus_zero_int4() {
473        tok_str("-00001");
474    }
475    #[test]
476    #[should_panic]
477    fn letters_int() {
478        tok_str("0bbc");
479    }
480
481    #[test]
482    fn float() {
483        assert_eq!(tok_str("0.0"), ["0.0"]);
484        assert_eq!(tok_typ("0.0"), [FloatValue]);
485        assert_eq!(tok_str("-0.0"), ["-0.0"]);
486        assert_eq!(tok_typ("-0.0"), [FloatValue]);
487        assert_eq!(tok_str("-1.0"), ["-1.0"]);
488        assert_eq!(tok_typ("-1.0"), [FloatValue]);
489        assert_eq!(tok_str("-1.023"), ["-1.023"]);
490        assert_eq!(tok_typ("-1.023"), [FloatValue]);
491        assert_eq!(tok_str("-132.0"), ["-132.0"]);
492        assert_eq!(tok_typ("-132.0"), [FloatValue]);
493        assert_eq!(tok_str("132.0"), ["132.0"]);
494        assert_eq!(tok_typ("132.0"), [FloatValue]);
495        assert_eq!(tok_str("0e+0"), ["0e+0"]);
496        assert_eq!(tok_typ("0e+0"), [FloatValue]);
497        assert_eq!(tok_str("0.0e+0"), ["0.0e+0"]);
498        assert_eq!(tok_typ("0.0e+0"), [FloatValue]);
499        assert_eq!(tok_str("-0e+0"), ["-0e+0"]);
500        assert_eq!(tok_typ("-0e+0"), [FloatValue]);
501        assert_eq!(tok_str("-1e+0"), ["-1e+0"]);
502        assert_eq!(tok_typ("-1e+0"), [FloatValue]);
503        assert_eq!(tok_str("-132e+0"), ["-132e+0"]);
504        assert_eq!(tok_typ("-132e+0"), [FloatValue]);
505        assert_eq!(tok_str("132e+0"), ["132e+0"]);
506        assert_eq!(tok_typ("132e+0"), [FloatValue]);
507        assert_eq!(
508            tok_str("a(x: 10.0) { b }"),
509            ["a", "(", "x", ":", "10.0", ")", "{", "b", "}"]
510        );
511        assert_eq!(
512            tok_typ("a(x: 10.0) { b }"),
513            [
514                Name, Punctuator, Name, Punctuator, FloatValue, Punctuator, Punctuator, Name,
515                Punctuator
516            ]
517        );
518        assert_eq!(tok_str("1.23e4"), ["1.23e4"]);
519        assert_eq!(tok_typ("1.23e4"), [FloatValue]);
520    }
521
522    // TODO(tailhook) fix errors in parser and check error message
523    #[test]
524    #[should_panic]
525    fn no_int_float() {
526        tok_str(".0");
527    }
528    #[test]
529    #[should_panic]
530    fn no_int_float1() {
531        tok_str(".1");
532    }
533    #[test]
534    #[should_panic]
535    fn zero_float() {
536        tok_str("01.0");
537    }
538    #[test]
539    #[should_panic]
540    fn zero_float4() {
541        tok_str("00001.0");
542    }
543    #[test]
544    #[should_panic]
545    fn minus_float() {
546        tok_str("-.0");
547    }
548    #[test]
549    #[should_panic]
550    fn minus_zero_float() {
551        tok_str("-01.0");
552    }
553    #[test]
554    #[should_panic]
555    fn minus_zero_float4() {
556        tok_str("-00001.0");
557    }
558    #[test]
559    #[should_panic]
560    fn letters_float() {
561        tok_str("0bbc.0");
562    }
563    #[test]
564    #[should_panic]
565    fn letters_float2() {
566        tok_str("0.bbc");
567    }
568    #[test]
569    #[should_panic]
570    fn letters_float3() {
571        tok_str("0.bbce0");
572    }
573    #[test]
574    #[should_panic]
575    fn no_exp_sign_float() {
576        tok_str("0e0");
577    }
578    #[test]
579    #[should_panic]
580    fn unterminated_string() {
581        tok_str(r#""hello\""#);
582    }
583    #[test]
584    #[should_panic]
585    fn extra_unterminated_string() {
586        tok_str(r#""hello\\\""#);
587    }
588
589    #[test]
590    fn string() {
591        assert_eq!(tok_str(r#""""#), [r#""""#]);
592        assert_eq!(tok_typ(r#""""#), [StringValue]);
593        assert_eq!(tok_str(r#""hello""#), [r#""hello""#]);
594        assert_eq!(tok_str(r#""hello\\""#), [r#""hello\\""#]);
595        assert_eq!(tok_str(r#""hello\\\\""#), [r#""hello\\\\""#]);
596        assert_eq!(tok_str(r#""he\\llo""#), [r#""he\\llo""#]);
597        assert_eq!(tok_typ(r#""hello""#), [StringValue]);
598        assert_eq!(tok_str(r#""my\"quote""#), [r#""my\"quote""#]);
599        assert_eq!(tok_typ(r#""my\"quote""#), [StringValue]);
600    }
601
602    #[test]
603    fn block_string() {
604        assert_eq!(tok_str(r#""""""""#), [r#""""""""#]);
605        assert_eq!(tok_typ(r#""""""""#), [BlockString]);
606        assert_eq!(tok_str(r#""""hello""""#), [r#""""hello""""#]);
607        assert_eq!(tok_typ(r#""""hello""""#), [BlockString]);
608        assert_eq!(tok_str(r#""""my "quote" """"#), [r#""""my "quote" """"#]);
609        assert_eq!(tok_typ(r#""""my "quote" """"#), [BlockString]);
610        assert_eq!(tok_str(r#""""\"""quote" """"#), [r#""""\"""quote" """"#]);
611        assert_eq!(tok_typ(r#""""\"""quote" """"#), [BlockString]);
612    }
613}