graphql_parser/
tokenizer.rs

1use std::fmt;
2
3use combine::easy::{Error, Errors, Info};
4use combine::error::StreamError;
5use combine::stream::ResetStream;
6use combine::{Positioned, StreamOnce};
7
8use crate::position::Pos;
9
10#[derive(Debug, PartialEq, Eq, Clone, Copy)]
11pub enum Kind {
12    Punctuator,
13    Name,
14    IntValue,
15    FloatValue,
16    StringValue,
17    BlockString,
18}
19
20#[derive(Debug, PartialEq, Eq, Clone, Copy)]
21pub struct Token<'a> {
22    pub kind: Kind,
23    pub value: &'a str,
24}
25
26#[derive(Debug, PartialEq)]
27pub struct TokenStream<'a> {
28    buf: &'a str,
29    position: Pos,
30    off: usize,
31    next_state: Option<(usize, Token<'a>, usize, Pos)>,
32    recursion_limit: usize,
33}
34
35impl TokenStream<'_> {
36    pub(crate) fn offset(&self) -> usize {
37        self.off
38    }
39}
40
41#[derive(Clone, Debug, PartialEq)]
42pub struct Checkpoint {
43    position: Pos,
44    off: usize,
45}
46
47impl<'a> StreamOnce for TokenStream<'a> {
48    type Token = Token<'a>;
49    type Range = Token<'a>;
50    type Position = Pos;
51    type Error = Errors<Token<'a>, Token<'a>, Pos>;
52
53    fn uncons(&mut self) -> Result<Self::Token, Error<Token<'a>, Token<'a>>> {
54        if let Some((at, tok, off, pos)) = self.next_state {
55            if at == self.off {
56                self.off = off;
57                self.position = pos;
58                return Ok(tok);
59            }
60        }
61        let old_pos = self.off;
62        let (kind, len) = self.take_token()?;
63        let value = &self.buf[self.off - len..self.off];
64        self.skip_whitespace();
65        let token = Token { kind, value };
66        self.next_state = Some((old_pos, token, self.off, self.position));
67        Ok(token)
68    }
69}
70
71impl<'a> Positioned for TokenStream<'a> {
72    fn position(&self) -> Self::Position {
73        self.position
74    }
75}
76
77impl<'a> ResetStream for TokenStream<'a> {
78    type Checkpoint = Checkpoint;
79    fn checkpoint(&self) -> Self::Checkpoint {
80        Checkpoint {
81            position: self.position,
82            off: self.off,
83        }
84    }
85    fn reset(&mut self, checkpoint: Checkpoint) -> Result<(), Self::Error> {
86        self.position = checkpoint.position;
87        self.off = checkpoint.off;
88        Ok(())
89    }
90}
91
92// NOTE: we expect that first character is always digit or minus, as returned
93// by tokenizer
94fn check_int(value: &str) -> bool {
95    value == "0"
96        || value == "-0"
97        || (!value.starts_with('0')
98            && value != "-"
99            && !value.starts_with("-0")
100            && value[1..].chars().all(|x| x.is_ascii_digit()))
101}
102
103fn check_dec(value: &str) -> bool {
104    !value.is_empty() && value.chars().all(|x| x.is_ascii_digit())
105}
106
107fn check_exp(value: &str) -> bool {
108    if value.is_empty() {
109        return false;
110    }
111    let first = value.chars().next().unwrap();
112    if first != '-' && first != '+' && (first <= '0' || first >= '9') {
113        return false;
114    }
115
116    value[1..].chars().all(|x| x.is_ascii_digit())
117}
118
119fn check_float(value: &str, exponent: Option<usize>, real: Option<usize>) -> bool {
120    match (exponent, real) {
121        (Some(e), Some(r)) if e < r => false,
122        (Some(e), Some(r)) => {
123            check_int(&value[..r]) && check_dec(&value[r + 1..e]) && check_exp(&value[e + 1..])
124        }
125        (Some(e), None) => check_int(&value[..e]) && check_exp(&value[e + 1..]),
126        (None, Some(r)) => check_int(&value[..r]) && check_dec(&value[r + 1..]),
127        (None, None) => unreachable!(),
128    }
129}
130
131impl<'a> TokenStream<'a> {
132    pub fn new(s: &str) -> TokenStream {
133        Self::with_recursion_limit(s, 50)
134    }
135
136    /// Specify a limit to recursive parsing. Note that increasing the limit
137    /// from the default may represent a security issue since a maliciously
138    /// crafted input may cause a stack overflow, crashing the process.
139    pub(crate) fn with_recursion_limit(s: &str, recursion_limit: usize) -> TokenStream {
140        let mut me = TokenStream {
141            buf: s,
142            position: Pos { line: 1, column: 1 },
143            off: 0,
144            next_state: None,
145            recursion_limit,
146        };
147        me.skip_whitespace();
148        me
149    }
150
151    /// Convenience for the common case where a token does
152    /// not span multiple lines. Infallible.
153    #[inline]
154    fn advance_token<T>(&mut self, kind: Kind, size: usize) -> Result<(Kind, usize), T> {
155        self.position.column += size;
156        self.off += size;
157        Ok((kind, size))
158    }
159
160    fn take_token(&mut self) -> Result<(Kind, usize), Error<Token<'a>, Token<'a>>> {
161        use self::Kind::*;
162        let mut iter = self.buf[self.off..].char_indices();
163        let cur_char = match iter.next() {
164            Some((_, x)) => x,
165            None => return Err(Error::end_of_input()),
166        };
167
168        match cur_char {
169            '(' | '[' | '{' => {
170                // Check for recursion limit
171                self.recursion_limit = self
172                    .recursion_limit
173                    .checked_sub(1)
174                    .ok_or_else(|| Error::message_static_message("Recursion limit exceeded"))?;
175
176                self.advance_token(Punctuator, 1)
177            }
178            ')' | ']' | '}' => {
179                // Notes on exceptional cases:
180                // recursion_limit may exceed the original value specified
181                // when constructing the Tokenizer. It may at first
182                // seem like this would be a good place to handle that,
183                // but instead this code allows this token to propagate up
184                // to the parser which is better equipped to make specific
185                // error messages about unmatched pairs.
186                // The case where recursion limit would overflow but instead
187                // saturates is just a specific case of the more general
188                // occurrence above.
189                self.recursion_limit = self.recursion_limit.saturating_add(1);
190                self.advance_token(Punctuator, 1)
191            }
192            '!' | '$' | ':' | '=' | '@' | '|' | '&' => self.advance_token(Punctuator, 1),
193            '.' => {
194                if iter.as_str().starts_with("..") {
195                    self.advance_token(Punctuator, 3)
196                } else {
197                    Err(Error::Unexpected(Info::Owned(
198                        format_args!(
199                            "bare dot {:?} is not supported, \
200                            only \"...\"",
201                            cur_char
202                        )
203                        .to_string(),
204                    )))
205                }
206            }
207            '_' | 'a'..='z' | 'A'..='Z' => {
208                for (idx, cur_char) in iter.by_ref() {
209                    match cur_char {
210                        '_' | 'a'..='z' | 'A'..='Z' | '0'..='9' => continue,
211                        _ => return self.advance_token(Name, idx),
212                    }
213                }
214                let len = self.buf.len() - self.off;
215                self.position.column += len;
216                self.off += len;
217
218                Ok((Name, len))
219            }
220            '-' | '0'..='9' => {
221                let mut exponent = None;
222                let mut real = None;
223                let len = loop {
224                    let (idx, cur_char) = match iter.next() {
225                        Some(pair) => pair,
226                        None => break self.buf.len() - self.off,
227                    };
228                    match cur_char {
229                        // just scan for now, will validate later on
230                        ' ' | '\n' | '\r' | '\t' | ',' | '#' | '!' | '$' | ':' | '=' | '@'
231                        | '|' | '&' | '(' | ')' | '[' | ']' | '{' | '}' => break idx,
232                        '.' => real = Some(idx),
233                        'e' | 'E' => exponent = Some(idx),
234                        _ => {}
235                    }
236                };
237
238                if exponent.is_some() || real.is_some() {
239                    let value = &self.buf[self.off..][..len];
240                    if !check_float(value, exponent, real) {
241                        return Err(Error::Unexpected(Info::Owned(
242                            format_args!("unsupported float {:?}", value).to_string(),
243                        )));
244                    }
245                    self.position.column += len;
246                    self.off += len;
247
248                    Ok((FloatValue, len))
249                } else {
250                    let value = &self.buf[self.off..][..len];
251                    if !check_int(value) {
252                        return Err(Error::Unexpected(Info::Owned(
253                            format_args!("unsupported integer {:?}", value).to_string(),
254                        )));
255                    }
256                    self.advance_token(IntValue, len)
257                }
258            }
259            '"' => {
260                if iter.as_str().starts_with("\"\"") {
261                    let tail = &iter.as_str()[2..];
262                    for (end_idx, _) in tail.match_indices("\"\"\"") {
263                        if !tail[..end_idx].ends_with('\\') {
264                            self.update_position(end_idx + 6);
265                            return Ok((BlockString, end_idx + 6));
266                        }
267                    }
268
269                    Err(Error::Unexpected(Info::Owned(
270                        "unterminated block string value".to_string(),
271                    )))
272                } else {
273                    let mut nchars = 1;
274                    let mut escaped = false;
275                    for (idx, cur_char) in iter {
276                        nchars += 1;
277                        match cur_char {
278                            '"' if escaped => {}
279                            '"' => {
280                                self.position.column += nchars;
281                                self.off += idx + 1;
282                                return Ok((StringValue, idx + 1));
283                            }
284                            '\n' => {
285                                return Err(Error::Unexpected(Info::Owned(
286                                    "unterminated string value".to_string(),
287                                )));
288                            }
289
290                            _ => {}
291                        }
292
293                        // if we aren't escaped and the current char is a \, we are now escaped
294                        escaped = !escaped && cur_char == '\\';
295                    }
296                    Err(Error::Unexpected(Info::Owned(
297                        "unterminated string value".to_string(),
298                    )))
299                }
300            }
301            _ => Err(Error::Unexpected(Info::Owned(
302                format_args!("unexpected character {:?}", cur_char).to_string(),
303            ))),
304        }
305    }
306
307    fn skip_whitespace(&mut self) {
308        let mut iter = self.buf[self.off..].char_indices();
309        let idx = loop {
310            let (idx, cur_char) = match iter.next() {
311                Some(pair) => pair,
312                None => break self.buf.len() - self.off,
313            };
314            match cur_char {
315                '\u{feff}' | '\r' => continue,
316                '\t' => self.position.column += 8,
317                '\n' => {
318                    self.position.column = 1;
319                    self.position.line += 1;
320                }
321                // comma is also entirely ignored in spec
322                ' ' | ',' => {
323                    self.position.column += 1;
324                    continue;
325                }
326                //comment
327                '#' => {
328                    for (_, cur_char) in iter.by_ref() {
329                        // TODO(tailhook) ensure SourceCharacter
330                        if cur_char == '\r' || cur_char == '\n' {
331                            self.position.column = 1;
332                            self.position.line += 1;
333                            break;
334                        }
335                    }
336                    continue;
337                }
338                _ => break idx,
339            }
340        };
341        self.off += idx;
342    }
343
344    fn update_position(&mut self, len: usize) {
345        let val = &self.buf[self.off..][..len];
346        self.off += len;
347        let lines = val.as_bytes().iter().filter(|&&x| x == b'\n').count();
348        self.position.line += lines;
349        if lines > 0 {
350            let line_offset = val.rfind('\n').unwrap() + 1;
351            let num = val[line_offset..].chars().count();
352            self.position.column = num + 1;
353        } else {
354            let num = val.chars().count();
355            self.position.column += num;
356        }
357    }
358}
359
360impl<'a> fmt::Display for Token<'a> {
361    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
362        write!(f, "{}[{:?}]", self.value, self.kind)
363    }
364}
365
366#[cfg(test)]
367mod test {
368    use super::Kind::*;
369    use super::{Kind, TokenStream};
370    use combine::easy::Error;
371
372    use combine::{Positioned, StreamOnce};
373
374    fn tok_str(s: &str) -> Vec<&str> {
375        let mut r = Vec::new();
376        let mut s = TokenStream::new(s);
377        loop {
378            match s.uncons() {
379                Ok(x) => r.push(x.value),
380                Err(ref e) if e == &Error::end_of_input() => break,
381                Err(e) => panic!("Parse error at {}: {}", s.position(), e),
382            }
383        }
384        r
385    }
386    fn tok_typ(s: &str) -> Vec<Kind> {
387        let mut r = Vec::new();
388        let mut s = TokenStream::new(s);
389        loop {
390            match s.uncons() {
391                Ok(x) => r.push(x.kind),
392                Err(ref e) if e == &Error::end_of_input() => break,
393                Err(e) => panic!("Parse error at {}: {}", s.position(), e),
394            }
395        }
396        r
397    }
398
399    #[test]
400    fn comments_and_commas() {
401        assert_eq!(tok_str("# hello { world }"), &[] as &[&str]);
402        assert_eq!(tok_str("# x\n,,,"), &[] as &[&str]);
403        assert_eq!(tok_str(", ,,  ,,,  # x"), &[] as &[&str]);
404    }
405
406    #[test]
407    fn simple() {
408        assert_eq!(tok_str("a { b }"), ["a", "{", "b", "}"]);
409        assert_eq!(tok_typ("a { b }"), [Name, Punctuator, Name, Punctuator]);
410    }
411
412    #[test]
413    fn query() {
414        assert_eq!(
415            tok_str(
416                "query Query {
417            object { field }
418        }"
419            ),
420            ["query", "Query", "{", "object", "{", "field", "}", "}"]
421        );
422    }
423
424    #[test]
425    fn fragment() {
426        assert_eq!(tok_str("a { ...b }"), ["a", "{", "...", "b", "}"]);
427    }
428
429    #[test]
430    fn int() {
431        assert_eq!(tok_str("0"), ["0"]);
432        assert_eq!(tok_str("0,"), ["0"]);
433        assert_eq!(tok_str("0# x"), ["0"]);
434        assert_eq!(tok_typ("0"), [IntValue]);
435        assert_eq!(tok_str("-0"), ["-0"]);
436        assert_eq!(tok_typ("-0"), [IntValue]);
437        assert_eq!(tok_str("-1"), ["-1"]);
438        assert_eq!(tok_typ("-1"), [IntValue]);
439        assert_eq!(tok_str("-132"), ["-132"]);
440        assert_eq!(tok_typ("-132"), [IntValue]);
441        assert_eq!(tok_str("132"), ["132"]);
442        assert_eq!(tok_typ("132"), [IntValue]);
443        assert_eq!(
444            tok_str("a(x: 10) { b }"),
445            ["a", "(", "x", ":", "10", ")", "{", "b", "}"]
446        );
447        assert_eq!(
448            tok_typ("a(x: 10) { b }"),
449            [
450                Name, Punctuator, Name, Punctuator, IntValue, Punctuator, Punctuator, Name,
451                Punctuator
452            ]
453        );
454    }
455
456    // TODO(tailhook) fix errors in parser and check error message
457    #[test]
458    #[should_panic]
459    fn zero_int() {
460        tok_str("01");
461    }
462    #[test]
463    #[should_panic]
464    fn zero_int4() {
465        tok_str("00001");
466    }
467    #[test]
468    #[should_panic]
469    fn minus_int() {
470        tok_str("-");
471    }
472    #[test]
473    #[should_panic]
474    fn minus_zero_int() {
475        tok_str("-01");
476    }
477    #[test]
478    #[should_panic]
479    fn minus_zero_int4() {
480        tok_str("-00001");
481    }
482    #[test]
483    #[should_panic]
484    fn letters_int() {
485        tok_str("0bbc");
486    }
487
488    #[test]
489    fn float() {
490        assert_eq!(tok_str("0.0"), ["0.0"]);
491        assert_eq!(tok_typ("0.0"), [FloatValue]);
492        assert_eq!(tok_str("-0.0"), ["-0.0"]);
493        assert_eq!(tok_typ("-0.0"), [FloatValue]);
494        assert_eq!(tok_str("-1.0"), ["-1.0"]);
495        assert_eq!(tok_typ("-1.0"), [FloatValue]);
496        assert_eq!(tok_str("-1.023"), ["-1.023"]);
497        assert_eq!(tok_typ("-1.023"), [FloatValue]);
498        assert_eq!(tok_str("-132.0"), ["-132.0"]);
499        assert_eq!(tok_typ("-132.0"), [FloatValue]);
500        assert_eq!(tok_str("132.0"), ["132.0"]);
501        assert_eq!(tok_typ("132.0"), [FloatValue]);
502        assert_eq!(tok_str("0e+0"), ["0e+0"]);
503        assert_eq!(tok_typ("0e+0"), [FloatValue]);
504        assert_eq!(tok_str("0.0e+0"), ["0.0e+0"]);
505        assert_eq!(tok_typ("0.0e+0"), [FloatValue]);
506        assert_eq!(tok_str("-0e+0"), ["-0e+0"]);
507        assert_eq!(tok_typ("-0e+0"), [FloatValue]);
508        assert_eq!(tok_str("-1e+0"), ["-1e+0"]);
509        assert_eq!(tok_typ("-1e+0"), [FloatValue]);
510        assert_eq!(tok_str("-132e+0"), ["-132e+0"]);
511        assert_eq!(tok_typ("-132e+0"), [FloatValue]);
512        assert_eq!(tok_str("132e+0"), ["132e+0"]);
513        assert_eq!(tok_typ("132e+0"), [FloatValue]);
514        assert_eq!(
515            tok_str("a(x: 10.0) { b }"),
516            ["a", "(", "x", ":", "10.0", ")", "{", "b", "}"]
517        );
518        assert_eq!(
519            tok_typ("a(x: 10.0) { b }"),
520            [
521                Name, Punctuator, Name, Punctuator, FloatValue, Punctuator, Punctuator, Name,
522                Punctuator
523            ]
524        );
525        assert_eq!(tok_str("1.23e4"), ["1.23e4"]);
526        assert_eq!(tok_typ("1.23e4"), [FloatValue]);
527    }
528
529    // TODO(tailhook) fix errors in parser and check error message
530    #[test]
531    #[should_panic]
532    fn no_int_float() {
533        tok_str(".0");
534    }
535    #[test]
536    #[should_panic]
537    fn no_int_float1() {
538        tok_str(".1");
539    }
540    #[test]
541    #[should_panic]
542    fn zero_float() {
543        tok_str("01.0");
544    }
545    #[test]
546    #[should_panic]
547    fn zero_float4() {
548        tok_str("00001.0");
549    }
550    #[test]
551    #[should_panic]
552    fn minus_float() {
553        tok_str("-.0");
554    }
555    #[test]
556    #[should_panic]
557    fn minus_zero_float() {
558        tok_str("-01.0");
559    }
560    #[test]
561    #[should_panic]
562    fn minus_zero_float4() {
563        tok_str("-00001.0");
564    }
565    #[test]
566    #[should_panic]
567    fn letters_float() {
568        tok_str("0bbc.0");
569    }
570    #[test]
571    #[should_panic]
572    fn letters_float2() {
573        tok_str("0.bbc");
574    }
575    #[test]
576    #[should_panic]
577    fn letters_float3() {
578        tok_str("0.bbce0");
579    }
580    #[test]
581    #[should_panic]
582    fn no_exp_sign_float() {
583        tok_str("0e0");
584    }
585    #[test]
586    #[should_panic]
587    fn unterminated_string() {
588        tok_str(r#""hello\""#);
589    }
590    #[test]
591    #[should_panic]
592    fn extra_unterminated_string() {
593        tok_str(r#""hello\\\""#);
594    }
595
596    #[test]
597    fn string() {
598        assert_eq!(tok_str(r#""""#), [r#""""#]);
599        assert_eq!(tok_typ(r#""""#), [StringValue]);
600        assert_eq!(tok_str(r#""hello""#), [r#""hello""#]);
601        assert_eq!(tok_str(r#""hello\\""#), [r#""hello\\""#]);
602        assert_eq!(tok_str(r#""hello\\\\""#), [r#""hello\\\\""#]);
603        assert_eq!(tok_str(r#""he\\llo""#), [r#""he\\llo""#]);
604        assert_eq!(tok_typ(r#""hello""#), [StringValue]);
605        assert_eq!(tok_str(r#""my\"quote""#), [r#""my\"quote""#]);
606        assert_eq!(tok_typ(r#""my\"quote""#), [StringValue]);
607    }
608
609    #[test]
610    fn block_string() {
611        assert_eq!(tok_str(r#""""""""#), [r#""""""""#]);
612        assert_eq!(tok_typ(r#""""""""#), [BlockString]);
613        assert_eq!(tok_str(r#""""hello""""#), [r#""""hello""""#]);
614        assert_eq!(tok_typ(r#""""hello""""#), [BlockString]);
615        assert_eq!(tok_str(r#""""my "quote" """"#), [r#""""my "quote" """"#]);
616        assert_eq!(tok_typ(r#""""my "quote" """"#), [BlockString]);
617        assert_eq!(tok_str(r#""""\"""quote" """"#), [r#""""\"""quote" """"#]);
618        assert_eq!(tok_typ(r#""""\"""quote" """"#), [BlockString]);
619    }
620}