apollo_parser/lexer/mod.rs

mod cursor;
mod lookup;
mod token;
mod token_kind;

use crate::lexer::cursor::Cursor;
use crate::Error;
use crate::LimitTracker;
pub use token::Token;
pub use token_kind::TokenKind;

/// Parses GraphQL source text into tokens.
///
/// ```rust
/// use apollo_parser::Lexer;
///
/// let query = "
/// {
///     animal
///     ...snackSelection
///     ... on Pet {
///       playmates {
///         count
///       }
///     }
/// }
/// ";
/// let (tokens, errors) = Lexer::new(query).lex();
/// assert_eq!(errors.len(), 0);
/// ```
#[derive(Clone, Debug)]
pub struct Lexer<'a> {
    finished: bool,
    cursor: Cursor<'a>,
    pub(crate) limit_tracker: LimitTracker,
}

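/// States of the character-by-character state machine that `Cursor::advance`
/// drives to lex a single token. Each variant records which kind of token, or
/// which part of one, is currently being consumed.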
#[derive(Debug)]
enum State {
    Start,
    Ident,
    StringLiteralEscapedUnicode(usize),
    StringLiteral,
    StringLiteralStart,
    BlockStringLiteral,
    BlockStringLiteralBackslash,
    StringLiteralBackslash,
    LeadingZero,
    IntegerPart,
    DecimalPoint,
    FractionalPart,
    ExponentIndicator,
    ExponentSign,
    ExponentDigit,
    Whitespace,
    Comment,
    SpreadOperator,
    MinusSign,
}

impl<'a> Lexer<'a> {
    /// Create a lexer for a GraphQL source text.
    ///
    /// The Lexer is an iterator over tokens and errors:
    /// ```rust
    /// use apollo_parser::Lexer;
    ///
    /// let query = "# --- GraphQL here ---";
    ///
    /// let mut lexer = Lexer::new(query);
    /// let mut tokens = vec![];
    /// for token in lexer {
    ///     match token {
    ///         Ok(token) => tokens.push(token),
    ///         Err(error) => panic!("{:?}", error),
    ///     }
    /// }
    /// ```
    pub fn new(input: &'a str) -> Self {
        Self {
            cursor: Cursor::new(input),
            finished: false,
            limit_tracker: LimitTracker::new(usize::MAX),
        }
    }

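    /// Set the maximum number of tokens this lexer may produce, counting
    /// whitespace, the final EOF token, and any errors. Once producing another
    /// token would exceed the limit, iteration yields a limit `Error` and stops.
    ///
    /// A small usage sketch; the input and limit here are arbitrary:
    /// ```rust
    /// use apollo_parser::Lexer;
    ///
    /// let lexer = Lexer::new("{ animal }").with_limit(100);
    /// let (_tokens, errors) = lexer.lex();
    /// assert!(errors.is_empty());
    /// ```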
    pub fn with_limit(mut self, limit: usize) -> Self {
        self.limit_tracker = LimitTracker::new(limit);
        self
    }

    /// Lex the full source text, consuming the lexer.
    pub fn lex(self) -> (Vec<Token<'a>>, Vec<Error>) {
        let mut tokens = vec![];
        let mut errors = vec![];

        for item in self {
            match item {
                Ok(token) => tokens.push(token),
                Err(error) => errors.push(error),
            }
        }

        (tokens, errors)
    }
}

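// Iteration yields `Ok(Token)` for every lexed token (terminated by an EOF
// token) and `Err(Error)` for invalid input or once the token limit is hit.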
impl<'a> Iterator for Lexer<'a> {
    type Item = Result<Token<'a>, Error>;

    fn next(&mut self) -> Option<Self::Item> {
        if self.finished {
            return None;
        }

        if self.limit_tracker.check_and_increment() {
            self.finished = true;
            return Some(Err(Error::limit(
                "token limit reached, aborting lexing",
                self.cursor.index(),
            )));
        }

        match self.cursor.advance() {
            Ok(token) => {
                if matches!(token.kind(), TokenKind::Eof) {
                    self.finished = true;
                }

                Some(Ok(token))
            }
            Err(err) => Some(Err(err)),
        }
    }
}

impl<'a> Cursor<'a> {
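    /// Lex a single token starting at the current cursor position, advancing
    /// the cursor past it. Returns an `Eof` token once the input is exhausted.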
    fn advance(&mut self) -> Result<Token<'a>, Error> {
        let mut state = State::Start;
        let mut token = Token {
            kind: TokenKind::Eof,
            data: "",
            index: self.index(),
        };

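        // Drive the state machine one character at a time until a complete
        // token or an error is produced.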
        loop {
            let Some(c) = self.bump() else {
                return self.eof(state, token);
            };
            match state {
                State::Start => {
                    if let Some(t) = lookup::punctuation_kind(c) {
                        token.kind = t;
                        token.data = self.current_str();
                        return Ok(token);
                    }

                    if lookup::is_namestart(c) {
                        token.kind = TokenKind::Name;
                        state = State::Ident;

                        continue;
                    }

                    if c != '0' && c.is_ascii_digit() {
                        token.kind = TokenKind::Int;
                        state = State::IntegerPart;

                        continue;
                    }

                    match c {
                        '"' => {
                            token.kind = TokenKind::StringValue;
                            state = State::StringLiteralStart;
                        }
                        '#' => {
                            token.kind = TokenKind::Comment;
                            state = State::Comment;
                        }
                        '.' => {
                            token.kind = TokenKind::Spread;
                            state = State::SpreadOperator;
                        }
                        '-' => {
                            token.kind = TokenKind::Int;
                            state = State::MinusSign;
                        }
                        '0' => {
                            token.kind = TokenKind::Int;
                            state = State::LeadingZero;
                        }
                        c if is_whitespace_assimilated(c) => {
                            token.kind = TokenKind::Whitespace;
                            state = State::Whitespace;
                        }
                        c => {
                            return Err(Error::with_loc(
                                format!("Unexpected character \"{}\"", c),
                                self.current_str().to_string(),
                                token.index,
                            ));
                        }
                    };
                }
                State::Ident => match c {
                    curr if is_name_continue(curr) => {}
                    _ => {
                        token.data = self.prev_str();
                        return self.done(token);
                    }
                },
                State::Whitespace => match c {
                    curr if is_whitespace_assimilated(curr) => {}
                    _ => {
                        token.data = self.prev_str();
                        return self.done(token);
                    }
                },
                State::BlockStringLiteral => match c {
                    '\\' => {
                        state = State::BlockStringLiteralBackslash;
                    }
                    '"' => {
                        // Require two additional quotes to complete the triple quote.
                        if self.eatc('"') && self.eatc('"') {
                            token.data = self.current_str();
                            return self.done(token);
                        }
                    }
                    _ => {}
                },
                State::StringLiteralStart => match c {
                    '"' => {
                        if self.eatc('"') {
                            state = State::BlockStringLiteral;

                            continue;
                        }

                        if self.is_pending() {
                            token.data = self.prev_str();
                        } else {
                            token.data = self.current_str();
                        }
                        return self.done(token);
                    }
                    '\\' => {
                        state = State::StringLiteralBackslash;
                    }
                    _ => {
                        state = State::StringLiteral;

                        continue;
                    }
                },
                State::StringLiteralEscapedUnicode(remaining) => match c {
                    '"' => {
                        self.add_err(Error::with_loc(
                            "incomplete unicode escape sequence",
                            c.to_string(),
                            token.index,
                        ));
                        token.data = self.current_str();
                        return self.done(token);
                    }
                    c if !c.is_ascii_hexdigit() => {
                        self.add_err(Error::with_loc(
                            "invalid unicode escape sequence",
                            c.to_string(),
                            0,
                        ));
                        state = State::StringLiteral;

                        continue;
                    }
                    _ => {
                        if remaining <= 1 {
                            state = State::StringLiteral;
                            let hex_end = self.offset + 1;
                            let hex_start = hex_end - 4;
                            let hex = &self.source[hex_start..hex_end];
                            // The `is_ascii_hexdigit()` checks in previous iterations ensure
                            // that this `unwrap()` does not panic:
                            let code_point = u32::from_str_radix(hex, 16).unwrap();
                            if char::from_u32(code_point).is_none() {
                                // TODO: https://github.com/apollographql/apollo-rs/issues/657 needs
                                // changes both here and in `ast/node_ext.rs`
                                let escape_sequence_start = hex_start - 2; // include "\u"
                                let escape_sequence = &self.source[escape_sequence_start..hex_end];
                                self.add_err(Error::with_loc(
                                    "surrogate code point is invalid in unicode escape sequence \
                                     (paired surrogate not supported yet: \
                                     https://github.com/apollographql/apollo-rs/issues/657)",
                                    escape_sequence.to_owned(),
                                    0,
                                ));
                            }
                            continue;
                        }

                        state = State::StringLiteralEscapedUnicode(remaining - 1)
                    }
                },
                State::StringLiteral => match c {
                    '"' => {
                        token.data = self.current_str();
                        return self.done(token);
                    }
                    curr if is_line_terminator(curr) => {
                        self.add_err(Error::with_loc(
                            "unexpected line terminator",
                            "".to_string(),
                            0,
                        ));
                    }
                    '\\' => {
                        state = State::StringLiteralBackslash;
                    }
                    _ => {}
                },
                State::BlockStringLiteralBackslash => match c {
                    '"' => {
                        // If this is \""", we need to eat 3 in total, and then continue parsing.
                        // The lexer does not un-escape escape sequences so it's OK
                        // if we take this path for \"", even if that is technically not an escape
                        // sequence.
                        if self.eatc('"') {
                            self.eatc('"');
                        }

                        state = State::BlockStringLiteral;
                    }
                    '\\' => {
                        // We need to stay in the backslash state:
                        // it's legal to write \\\""" with two literal backslashes
                        // and then the escape sequence.
                    }
                    _ => {
                        state = State::BlockStringLiteral;
                    }
                },
                State::StringLiteralBackslash => match c {
                    curr if is_escaped_char(curr) => {
                        state = State::StringLiteral;
                    }
                    'u' => {
                        state = State::StringLiteralEscapedUnicode(4);
                    }
                    _ => {
                        self.add_err(Error::with_loc(
                            "unexpected escaped character",
                            c.to_string(),
                            0,
                        ));

                        state = State::StringLiteral;
                    }
                },
                State::LeadingZero => match c {
                    '.' => {
                        token.kind = TokenKind::Float;
                        state = State::DecimalPoint;
                    }
                    'e' | 'E' => {
                        token.kind = TokenKind::Float;
                        state = State::ExponentIndicator;
                    }
                    _ if c.is_ascii_digit() => {
                        return Err(Error::with_loc(
                            "Numbers must not have non-significant leading zeroes",
                            self.current_str().to_string(),
                            token.index,
                        ));
                    }
                    _ if lookup::is_namestart(c) => {
                        return Err(Error::with_loc(
                            format!("Unexpected character `{c}` as integer suffix"),
                            self.current_str().to_string(),
                            token.index,
                        ));
                    }
                    _ => {
                        token.data = self.prev_str();
                        return self.done(token);
                    }
                },
                State::IntegerPart => match c {
                    curr if curr.is_ascii_digit() => {}
                    '.' => {
                        token.kind = TokenKind::Float;
                        state = State::DecimalPoint;
                    }
                    'e' | 'E' => {
                        token.kind = TokenKind::Float;
                        state = State::ExponentIndicator;
                    }
                    _ if lookup::is_namestart(c) => {
                        return Err(Error::with_loc(
                            format!("Unexpected character `{c}` as integer suffix"),
                            self.current_str().to_string(),
                            token.index,
                        ));
                    }
                    _ => {
                        token.data = self.prev_str();
                        return self.done(token);
                    }
                },
                State::DecimalPoint => match c {
                    curr if curr.is_ascii_digit() => {
                        state = State::FractionalPart;
                    }
                    _ => {
                        return Err(Error::with_loc(
                            format!("Unexpected character `{c}`, expected fractional digit"),
                            self.current_str().to_string(),
                            token.index,
                        ));
                    }
                },
                State::FractionalPart => match c {
                    curr if curr.is_ascii_digit() => {}
                    'e' | 'E' => {
                        state = State::ExponentIndicator;
                    }
                    _ if c == '.' || lookup::is_namestart(c) => {
                        return Err(Error::with_loc(
                            format!("Unexpected character `{c}` as float suffix"),
                            self.current_str().to_string(),
                            token.index,
                        ));
                    }
                    _ => {
                        token.data = self.prev_str();
                        return self.done(token);
                    }
                },
                State::ExponentIndicator => match c {
                    _ if c.is_ascii_digit() => {
                        state = State::ExponentDigit;
                    }
                    '+' | '-' => {
                        state = State::ExponentSign;
                    }
                    _ => {
                        return Err(Error::with_loc(
                            format!("Unexpected character `{c}`, expected exponent digit or sign"),
                            self.current_str().to_string(),
                            token.index,
                        ))
                    }
                },
                State::ExponentSign => match c {
                    _ if c.is_ascii_digit() => {
                        state = State::ExponentDigit;
                    }
                    _ => {
                        return Err(Error::with_loc(
                            format!("Unexpected character `{c}`, expected exponent digit"),
                            self.current_str().to_string(),
                            token.index,
                        ))
                    }
                },
                State::ExponentDigit => match c {
                    _ if c.is_ascii_digit() => {
                        state = State::ExponentDigit;
                    }
                    _ if c == '.' || lookup::is_namestart(c) => {
                        return Err(Error::with_loc(
                            format!("Unexpected character `{c}` as float suffix"),
                            self.current_str().to_string(),
                            token.index,
                        ));
                    }
                    _ => {
                        token.data = self.prev_str();
                        return self.done(token);
                    }
                },
                State::SpreadOperator => {
                    if c == '.' && self.eatc('.') {
                        token.data = self.current_str();
                        return Ok(token);
                    }
                    return self.unterminated_spread_operator(&token);
                }
                State::MinusSign => match c {
                    '0' => {
                        state = State::LeadingZero;
                    }
                    curr if curr.is_ascii_digit() => {
                        state = State::IntegerPart;
                    }
                    _ => {
                        return Err(Error::with_loc(
                            format!("Unexpected character `{c}`"),
                            self.current_str().to_string(),
                            token.index,
                        ))
                    }
                },
                State::Comment => match c {
                    curr if is_line_terminator(curr) => {
                        token.data = self.prev_str();
                        return self.done(token);
                    }
                    _ => {}
                },
            }
        }
    }

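    /// Handle running out of input: depending on the state the machine was in,
    /// either finish the in-progress token or report an error such as an
    /// unterminated string value.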
    fn eof(&mut self, state: State, mut token: Token<'a>) -> Result<Token<'a>, Error> {
        match state {
            State::Start => {
                token.index += 1;
                Ok(token)
            }
            State::StringLiteralStart => {
                let curr = self.current_str();

                Err(Error::with_loc(
                    "unexpected end of data while lexing string value",
                    curr.to_string(),
                    token.index,
                ))
            }
            State::StringLiteral
            | State::BlockStringLiteral
            | State::StringLiteralEscapedUnicode(_)
            | State::BlockStringLiteralBackslash
            | State::StringLiteralBackslash => {
                let curr = self.drain();

                Err(Error::with_loc(
                    "unterminated string value",
                    curr.to_string(),
                    token.index,
                ))
            }
            State::SpreadOperator => self.unterminated_spread_operator(&token),
            State::MinusSign => Err(Error::with_loc(
                "Unexpected character \"-\"",
                self.current_str().to_string(),
                token.index,
            )),
            State::DecimalPoint | State::ExponentIndicator | State::ExponentSign => {
                Err(Error::with_loc(
                    "Unexpected EOF in float value",
                    self.current_str().to_string(),
                    token.index,
                ))
            }
            State::Ident
            | State::LeadingZero
            | State::IntegerPart
            | State::FractionalPart
            | State::ExponentDigit
            | State::Whitespace
            | State::Comment => {
                if let Some(mut err) = self.err() {
                    err.set_data(self.current_str().to_string());
                    return Err(err);
                }

                token.data = self.current_str();

                Ok(token)
            }
        }
    }

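    /// Report a spread operator that was cut short, i.e. `.` or `..` not
    /// followed by enough dots to form `...`.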
    fn unterminated_spread_operator(&mut self, token: &Token<'a>) -> Result<Token<'a>, Error> {
        let data = if self.is_pending() {
            self.prev_str()
        } else {
            self.current_str()
        };

        Err(Error::with_loc(
            "Unterminated spread operator",
            data.to_string(),
            token.index,
        ))
    }

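    /// Finish a token, but if an error was recorded on the cursor while lexing
    /// it, surface that error (tagged with this token's data and index) instead.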
    fn done(&mut self, token: Token<'a>) -> Result<Token<'a>, Error> {
        if let Some(mut err) = self.err() {
            err.set_data(token.data.to_string());
            err.index = token.index;
            self.err = None;
            return Err(err);
        }
        Ok(token)
    }
}

/// Ignored tokens other than comments and commas are assimilated to whitespace
/// <https://spec.graphql.org/October2021/#Ignored>
fn is_whitespace_assimilated(c: char) -> bool {
    matches!(
        c,
        // https://spec.graphql.org/October2021/#WhiteSpace
        '\u{0009}'   // \t
        | '\u{0020}' // space
        // https://spec.graphql.org/October2021/#LineTerminator
        | '\u{000A}' // \n
        | '\u{000D}' // \r
        // https://spec.graphql.org/October2021/#UnicodeBOM
        | '\u{FEFF}' // Unicode BOM (Byte Order Mark)
    )
}

/// <https://spec.graphql.org/October2021/#NameContinue>
fn is_name_continue(c: char) -> bool {
    matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '_')
}

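/// <https://spec.graphql.org/October2021/#LineTerminator>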
fn is_line_terminator(c: char) -> bool {
    matches!(c, '\n' | '\r')
}

// EscapedCharacter
//     "  \  /  b  f  n  r  t
fn is_escaped_char(c: char) -> bool {
    matches!(c, '"' | '\\' | '/' | 'b' | 'f' | 'n' | 'r' | 't')
}

#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn unterminated_string() {
        let schema = r#"
type Query {
    name: String
    format: String = "Y-m-d\\TH:i:sP"
}
        "#;
        let (tokens, errors) = Lexer::new(schema).lex();
        dbg!(tokens);
        dbg!(errors);
    }

    #[test]
    fn token_limit() {
        let lexer = Lexer::new("type Query { a a a a a a a a a }").with_limit(10);
        let (tokens, errors) = lexer.lex();
        assert_eq!(tokens.len(), 10);
        assert_eq!(
            errors,
            &[Error::limit("token limit reached, aborting lexing", 17)]
        );
    }

    #[test]
    fn token_limit_exact() {
        let lexer = Lexer::new("type Query { a a a a a a a a a }").with_limit(26);
        let (tokens, errors) = lexer.lex();
        assert_eq!(tokens.len(), 26);
        assert!(errors.is_empty());

        let lexer = Lexer::new("type Query { a a a a a a a a a }").with_limit(25);
        let (tokens, errors) = lexer.lex();
        assert_eq!(tokens.len(), 25);
        assert_eq!(
            errors,
            &[Error::limit("token limit reached, aborting lexing", 31)]
        );
    }

    #[test]
    fn errors_and_token_limit() {
        let lexer = Lexer::new("type Query { ..a a a a a a a a a }").with_limit(10);
        let (tokens, errors) = lexer.lex();
        // Errors contribute to the token limit
        assert_eq!(tokens.len(), 9);
        assert_eq!(
            errors,
            &[
                Error::with_loc("Unterminated spread operator", "..".to_string(), 13),
                Error::limit("token limit reached, aborting lexing", 18),
            ],
        );
    }

    #[test]
    fn stream_produces_original_input() {
        let schema = r#"
type Query {
    name: String
    format: String = "Y-m-d\\TH:i:sP"
}
        "#;

        let lexer = Lexer::new(schema);
        let processed_schema = lexer
            .into_iter()
            .fold(String::new(), |acc, token| acc + token.unwrap().data());

        assert_eq!(schema, processed_schema);
    }

    #[test]
    fn quoted_block_comment() {
        let input = r#"
"""
Not an escape character:
'/\W/'
Escape character:
\"""
\"""\"""
Not escape characters:
\" \""
Escape character followed by a quote:
\""""
"""
        "#;

        let (tokens, errors) = Lexer::new(input).lex();
        assert!(errors.is_empty());
        // The token data should be literally the source text.
        assert_eq!(
            tokens[1].data,
            r#"
"""
Not an escape character:
'/\W/'
Escape character:
\"""
\"""\"""
Not escape characters:
\" \""
Escape character followed by a quote:
\""""
"""
"#
            .trim(),
        );

        let input = r#"
# String contents: """
"""\""""""
# Unclosed block string
"""\"""
        "#;
        let (tokens, errors) = Lexer::new(input).lex();
        assert_eq!(tokens[3].data, r#""""\"""""""#);
        assert_eq!(
            errors,
            &[Error::with_loc(
                "unterminated string value",
                r#""""\"""
        "#
                .to_string(),
                59,
            )]
        );
    }

    #[test]
    fn unexpected_character() {
        let schema = r#"
type Query {
    name: String
}
/
        "#;
        let (tokens, errors) = Lexer::new(schema).lex();
        dbg!(tokens);
        assert_eq!(
            errors,
            &[Error::with_loc(
                "Unexpected character \"/\"",
                "/".to_string(),
                33,
            )]
        );
    }
}