apollo_parser/lexer/
mod.rs

1mod cursor;
2mod lookup;
3mod token;
4mod token_kind;
5
6use crate::lexer::cursor::Cursor;
7use crate::Error;
8use crate::LimitTracker;
9pub use token::Token;
10pub use token_kind::TokenKind;
11
/// Parses GraphQL source text into tokens.
///
/// A `Lexer` is an [`Iterator`] yielding `Result<Token, Error>`; use
/// [`Lexer::lex`] to collect everything into separate token and error lists.
///
/// # Examples
///
/// ```rust
/// use apollo_parser::Lexer;
///
/// let query = "
/// {
///     animal
///     ...snackSelection
///     ... on Pet {
///       playmates {
///         count
///       }
///     }
/// }
/// ";
/// let (tokens, errors) = Lexer::new(query).lex();
/// assert_eq!(errors.len(), 0);
/// ```
#[derive(Clone, Debug)]
pub struct Lexer<'a> {
    // Set once the EOF token has been yielded or the limit error has been
    // reported; from then on the iterator only returns `None`.
    finished: bool,
    // Position in the source text; `Cursor::advance` produces the next token.
    cursor: Cursor<'a>,
    // Consulted (and incremented) on every call to `next`, before lexing, so
    // both tokens and errors count towards the limit.
    pub(crate) limit_tracker: LimitTracker,
}
36
/// States of the hand-written state machine driven by `Cursor::advance`.
///
/// Each call to `advance` begins in `Start` and moves through these states one
/// character at a time until a token is complete or an error is returned.
#[derive(Debug)]
enum State {
    /// No character of the token has been classified yet.
    Start,
    /// Inside a name, after a name-start character.
    Ident,
    /// Inside a `\u` escape of a regular string; the payload is the number of
    /// hex digits still expected (starts at 4).
    StringLiteralEscapedUnicode(usize),
    /// Inside a regular `"`-delimited string, past its first character.
    StringLiteral,
    /// The opening `"` was just consumed; the next character decides between
    /// an empty string, a block string, and regular string content.
    StringLiteralStart,
    /// Inside a `"""` block string.
    BlockStringLiteral,
    /// A backslash was just consumed inside a block string.
    BlockStringLiteralBackslash,
    /// A backslash was just consumed inside a regular string.
    StringLiteralBackslash,
    /// A number whose first digit is `0` (optionally preceded by `-`).
    LeadingZero,
    /// Consuming the digits of an integer part that starts with a non-zero digit.
    IntegerPart,
    /// The `.` of a float was just consumed; a digit must follow.
    DecimalPoint,
    /// Consuming the digits after the decimal point.
    FractionalPart,
    /// An `e`/`E` was just consumed; a digit or a sign must follow.
    ExponentIndicator,
    /// The `+`/`-` of an exponent was just consumed; a digit must follow.
    ExponentSign,
    /// Consuming the digits of an exponent.
    ExponentDigit,
    /// Inside a run of whitespace-assimilated characters.
    Whitespace,
    /// Inside a `#` comment, which runs until a line terminator.
    Comment,
    /// The first `.` of a spread operator was consumed; two more must follow.
    SpreadOperator,
    /// A `-` was consumed; a digit must follow.
    MinusSign,
}
59
60impl<'a> Lexer<'a> {
61    /// Create a lexer for a GraphQL source text.
62    ///
63    /// The Lexer is an iterator over tokens and errors:
64    /// ```rust
65    /// use apollo_parser::Lexer;
66    ///
67    /// let query = "# --- GraphQL here ---";
68    ///
69    /// let mut lexer = Lexer::new(query);
70    /// let mut tokens = vec![];
71    /// for token in lexer {
72    ///     match token {
73    ///         Ok(token) => tokens.push(token),
74    ///         Err(error) => panic!("{:?}", error),
75    ///     }
76    /// }
77    /// ```
78    pub fn new(input: &'a str) -> Self {
79        Self {
80            cursor: Cursor::new(input),
81            finished: false,
82            limit_tracker: LimitTracker::new(usize::MAX),
83        }
84    }
85
86    pub fn with_limit(mut self, limit: usize) -> Self {
87        self.limit_tracker = LimitTracker::new(limit);
88        self
89    }
90
91    /// Lex the full source text, consuming the lexer.
92    pub fn lex(self) -> (Vec<Token<'a>>, Vec<Error>) {
93        let mut tokens = vec![];
94        let mut errors = vec![];
95
96        for item in self {
97            match item {
98                Ok(token) => tokens.push(token),
99                Err(error) => errors.push(error),
100            }
101        }
102
103        (tokens, errors)
104    }
105}
106
107impl<'a> Iterator for Lexer<'a> {
108    type Item = Result<Token<'a>, Error>;
109
110    fn next(&mut self) -> Option<Self::Item> {
111        if self.finished {
112            return None;
113        }
114
115        if self.limit_tracker.check_and_increment() {
116            self.finished = true;
117            return Some(Err(Error::limit(
118                "token limit reached, aborting lexing",
119                self.cursor.index(),
120            )));
121        }
122
123        match self.cursor.advance() {
124            Ok(token) => {
125                if matches!(token.kind(), TokenKind::Eof) {
126                    self.finished = true;
127                }
128
129                Some(Ok(token))
130            }
131            Err(err) => Some(Err(err)),
132        }
133    }
134}
135
136impl<'a> Cursor<'a> {
    /// Lex a single token starting at the cursor's current index.
    ///
    /// This is a state machine: each loop iteration bumps one character and
    /// dispatches on the current [`State`]. It returns as soon as a token is
    /// complete (`Ok`) or a character makes the token invalid (`Err`). When
    /// the input runs out, the decision is delegated to `eof`.
    ///
    /// Two kinds of errors are produced:
    /// - immediate errors, returned directly with the token's start index;
    /// - deferred string errors, recorded with `add_err` so lexing can continue
    ///   to the end of the string. They surface when the token completes
    ///   through `done`, which replaces their data and index (the `0` indices
    ///   passed to `add_err` below are therefore placeholders).
    ///
    /// NOTE(review): `prev_str` appears to return the token text *excluding*
    /// the character just bumped and `current_str` the text *including* it;
    /// both are defined in `cursor.rs` — confirm there.
    fn advance(&mut self) -> Result<Token<'a>, Error> {
        let mut state = State::Start;
        // Starts out as an empty EOF token; that is exactly what `eof` returns
        // if no character is left. Otherwise `kind` and `data` are filled in
        // below.
        let mut token = Token {
            kind: TokenKind::Eof,
            data: "",
            index: self.index(),
        };

        loop {
            let Some(c) = self.bump() else {
                return self.eof(state, token);
            };
            match state {
                State::Start => {
                    // Punctuation is always a complete single-character token.
                    if let Some(t) = lookup::punctuation_kind(c) {
                        token.kind = t;
                        token.data = self.current_str();
                        return Ok(token);
                    }

                    if lookup::is_namestart(c) {
                        token.kind = TokenKind::Name;
                        state = State::Ident;

                        continue;
                    }

                    // `0` is handled separately below because a leading zero
                    // may not be followed by further digits.
                    if c != '0' && c.is_ascii_digit() {
                        token.kind = TokenKind::Int;
                        state = State::IntegerPart;

                        continue;
                    }

                    match c {
                        '"' => {
                            token.kind = TokenKind::StringValue;
                            state = State::StringLiteralStart;
                        }
                        '#' => {
                            token.kind = TokenKind::Comment;
                            state = State::Comment;
                        }
                        '.' => {
                            token.kind = TokenKind::Spread;
                            state = State::SpreadOperator;
                        }
                        '-' => {
                            // Assume an integer; upgraded to `Float` later if a
                            // `.` or exponent follows.
                            token.kind = TokenKind::Int;
                            state = State::MinusSign;
                        }
                        '0' => {
                            token.kind = TokenKind::Int;
                            state = State::LeadingZero;
                        }
                        c if is_whitespace_assimilated(c) => {
                            token.kind = TokenKind::Whitespace;
                            state = State::Whitespace;
                        }
                        c => {
                            return Err(Error::with_loc(
                                format!(r#"Unexpected character "{c}""#),
                                self.current_str().to_string(),
                                token.index,
                            ));
                        }
                    };
                }
                State::Ident => match c {
                    curr if is_name_continue(curr) => {}
                    // `c` belongs to the next token, so it is left out of the data.
                    _ => {
                        token.data = self.prev_str();
                        return self.done(token);
                    }
                },
                State::Whitespace => match c {
                    curr if is_whitespace_assimilated(curr) => {}
                    _ => {
                        token.data = self.prev_str();
                        return self.done(token);
                    }
                },
                State::BlockStringLiteral => match c {
                    '\\' => {
                        state = State::BlockStringLiteralBackslash;
                    }
                    '"'
                        // Require two additional quotes to complete the triple quote.
                        if self.eatc('"') && self.eatc('"') => {
                            token.data = self.current_str();
                            return self.done(token);
                        }
                    // Any other character (including a quote that did not
                    // complete a triple quote) is block string content.
                    _ => {}
                },
                State::StringLiteralStart => match c {
                    '"' => {
                        // A third quote in a row opens a block string.
                        if self.eatc('"') {
                            state = State::BlockStringLiteral;

                            continue;
                        }

                        // Otherwise this is the empty string `""`.
                        if self.is_pending() {
                            token.data = self.prev_str();
                        } else {
                            token.data = self.current_str();
                        }
                        return self.done(token);
                    }
                    '\\' => {
                        state = State::StringLiteralBackslash;
                    }
                    _ => {
                        state = State::StringLiteral;

                        continue;
                    }
                },
                State::StringLiteralEscapedUnicode(remaining) => match c {
                    // The string closed before all four hex digits were seen.
                    '"' => {
                        self.add_err(Error::with_loc(
                            "incomplete unicode escape sequence",
                            c.to_string(),
                            token.index,
                        ));
                        token.data = self.current_str();
                        return self.done(token);
                    }
                    c if !c.is_ascii_hexdigit() => {
                        self.add_err(Error::with_loc(
                            "invalid unicode escape sequence",
                            c.to_string(),
                            0,
                        ));
                        state = State::StringLiteral;

                        continue;
                    }
                    _ => {
                        // `remaining <= 1` means `c` is the last expected hex digit.
                        if remaining <= 1 {
                            state = State::StringLiteral;
                            // NOTE(review): assumes `self.offset` is the byte
                            // offset of the hex digit just bumped, so that
                            // `hex_start..hex_end` spans the four digits —
                            // confirm in `cursor.rs`.
                            let hex_end = self.offset + 1;
                            let hex_start = hex_end - 4;
                            let hex = &self.source[hex_start..hex_end];
                            // `is_ascii_hexdigit()` checks in previous iterations ensures
                            // this `unwrap()` does not panic:
                            let code_point = u32::from_str_radix(hex, 16).unwrap();
                            if char::from_u32(code_point).is_none() {
                                // TODO: https://github.com/apollographql/apollo-rs/issues/657 needs
                                // changes both here and in `ast/node_ext.rs`
                                let escape_sequence_start = hex_start - 2; // include "\u"
                                let escape_sequence = &self.source[escape_sequence_start..hex_end];
                                self.add_err(Error::with_loc(
                                    "surrogate code point is invalid in unicode escape sequence \
                                     (paired surrogate not supported yet: \
                                     https://github.com/apollographql/apollo-rs/issues/657)",
                                    escape_sequence.to_owned(),
                                    0,
                                ));
                            }
                            continue;
                        }

                        state = State::StringLiteralEscapedUnicode(remaining - 1)
                    }
                },
                State::StringLiteral => match c {
                    '"' => {
                        token.data = self.current_str();
                        return self.done(token);
                    }
                    // Recorded as a deferred error; lexing continues so the
                    // whole string is still consumed as one token.
                    curr if is_line_terminator(curr) => {
                        self.add_err(Error::with_loc(
                            "unexpected line terminator",
                            "".to_string(),
                            0,
                        ));
                    }
                    '\\' => {
                        state = State::StringLiteralBackslash;
                    }
                    _ => {}
                },
                State::BlockStringLiteralBackslash => match c {
                    '"' => {
                        // If this is \""", we need to eat 3 in total, and then continue parsing.
                        // The lexer does not un-escape escape sequences so it's OK
                        // if we take this path for \"", even if that is technically not an escape
                        // sequence.
                        if self.eatc('"') {
                            self.eatc('"');
                        }

                        state = State::BlockStringLiteral;
                    }
                    '\\' => {
                        // We need to stay in the backslash state:
                        // it's legal to write \\\""" with two literal backslashes
                        // and then the escape sequence.
                    }
                    _ => {
                        state = State::BlockStringLiteral;
                    }
                },
                State::StringLiteralBackslash => match c {
                    curr if is_escaped_char(curr) => {
                        state = State::StringLiteral;
                    }
                    // `\u` must be followed by four hex digits.
                    'u' => {
                        state = State::StringLiteralEscapedUnicode(4);
                    }
                    _ => {
                        self.add_err(Error::with_loc(
                            "unexpected escaped character",
                            c.to_string(),
                            0,
                        ));

                        state = State::StringLiteral;
                    }
                },
                State::LeadingZero => match c {
                    '.' => {
                        token.kind = TokenKind::Float;
                        state = State::DecimalPoint;
                    }
                    'e' | 'E' => {
                        token.kind = TokenKind::Float;
                        state = State::ExponentIndicator;
                    }
                    _ if c.is_ascii_digit() => {
                        return Err(Error::with_loc(
                            "Numbers must not have non-significant leading zeroes",
                            self.current_str().to_string(),
                            token.index,
                        ));
                    }
                    // A number may not run directly into a name.
                    _ if lookup::is_namestart(c) => {
                        return Err(Error::with_loc(
                            format!("Unexpected character `{c}` as integer suffix"),
                            self.current_str().to_string(),
                            token.index,
                        ));
                    }
                    _ => {
                        token.data = self.prev_str();
                        return self.done(token);
                    }
                },
                State::IntegerPart => match c {
                    curr if curr.is_ascii_digit() => {}
                    '.' => {
                        token.kind = TokenKind::Float;
                        state = State::DecimalPoint;
                    }
                    'e' | 'E' => {
                        token.kind = TokenKind::Float;
                        state = State::ExponentIndicator;
                    }
                    _ if lookup::is_namestart(c) => {
                        return Err(Error::with_loc(
                            format!("Unexpected character `{c}` as integer suffix"),
                            self.current_str().to_string(),
                            token.index,
                        ));
                    }
                    _ => {
                        token.data = self.prev_str();
                        return self.done(token);
                    }
                },
                State::DecimalPoint => match c {
                    curr if curr.is_ascii_digit() => {
                        state = State::FractionalPart;
                    }
                    _ => {
                        return Err(Error::with_loc(
                            format!("Unexpected character `{c}`, expected fractional digit"),
                            self.current_str().to_string(),
                            token.index,
                        ));
                    }
                },
                State::FractionalPart => match c {
                    curr if curr.is_ascii_digit() => {}
                    'e' | 'E' => {
                        state = State::ExponentIndicator;
                    }
                    // A second `.` or a name directly after a float is rejected.
                    _ if c == '.' || lookup::is_namestart(c) => {
                        return Err(Error::with_loc(
                            format!("Unexpected character `{c}` as float suffix"),
                            self.current_str().to_string(),
                            token.index,
                        ));
                    }
                    _ => {
                        token.data = self.prev_str();
                        return self.done(token);
                    }
                },
                State::ExponentIndicator => match c {
                    _ if c.is_ascii_digit() => {
                        state = State::ExponentDigit;
                    }
                    '+' | '-' => {
                        state = State::ExponentSign;
                    }
                    _ => {
                        return Err(Error::with_loc(
                            format!("Unexpected character `{c}`, expected exponent digit or sign"),
                            self.current_str().to_string(),
                            token.index,
                        ))
                    }
                },
                State::ExponentSign => match c {
                    _ if c.is_ascii_digit() => {
                        state = State::ExponentDigit;
                    }
                    _ => {
                        return Err(Error::with_loc(
                            format!("Unexpected character `{c}`, expected exponent digit"),
                            self.current_str().to_string(),
                            token.index,
                        ))
                    }
                },
                State::ExponentDigit => match c {
                    _ if c.is_ascii_digit() => {
                        state = State::ExponentDigit;
                    }
                    _ if c == '.' || lookup::is_namestart(c) => {
                        return Err(Error::with_loc(
                            format!("Unexpected character `{c}` as float suffix"),
                            self.current_str().to_string(),
                            token.index,
                        ));
                    }
                    _ => {
                        token.data = self.prev_str();
                        return self.done(token);
                    }
                },
                State::SpreadOperator => {
                    // `c` must be the second `.` and the next character the third.
                    if c == '.' && self.eatc('.') {
                        token.data = self.current_str();
                        return Ok(token);
                    }
                    return self.unterminated_spread_operator(&token);
                }
                State::MinusSign => match c {
                    '0' => {
                        state = State::LeadingZero;
                    }
                    curr if curr.is_ascii_digit() => {
                        state = State::IntegerPart;
                    }
                    _ => {
                        return Err(Error::with_loc(
                            format!("Unexpected character `{c}`"),
                            self.current_str().to_string(),
                            token.index,
                        ))
                    }
                },
                State::Comment => match c {
                    // The line terminator is not part of the comment token.
                    curr if is_line_terminator(curr) => {
                        token.data = self.prev_str();
                        return self.done(token);
                    }
                    _ => {}
                },
            }
        }
    }
512
513    fn eof(&mut self, state: State, mut token: Token<'a>) -> Result<Token<'a>, Error> {
514        match state {
515            State::Start => {
516                // Report EOF at the end of the input rather than one byte past it.
517                let end = self.source.len();
518                self.offset = end;
519                token.index = end;
520                Ok(token)
521            }
522            State::StringLiteralStart => {
523                let curr = self.current_str();
524
525                Err(Error::with_loc(
526                    "unexpected end of data while lexing string value",
527                    curr.to_string(),
528                    token.index,
529                ))
530            }
531            State::StringLiteral
532            | State::BlockStringLiteral
533            | State::StringLiteralEscapedUnicode(_)
534            | State::BlockStringLiteralBackslash
535            | State::StringLiteralBackslash => {
536                let curr = self.drain();
537
538                Err(Error::with_loc(
539                    "unterminated string value",
540                    curr.to_string(),
541                    token.index,
542                ))
543            }
544            State::SpreadOperator => self.unterminated_spread_operator(&token),
545            State::MinusSign => Err(Error::with_loc(
546                "Unexpected character \"-\"",
547                self.current_str().to_string(),
548                token.index,
549            )),
550            State::DecimalPoint | State::ExponentIndicator | State::ExponentSign => {
551                Err(Error::with_loc(
552                    "Unexpected EOF in float value",
553                    self.current_str().to_string(),
554                    token.index,
555                ))
556            }
557            State::Ident
558            | State::LeadingZero
559            | State::IntegerPart
560            | State::FractionalPart
561            | State::ExponentDigit
562            | State::Whitespace
563            | State::Comment => {
564                if let Some(mut err) = self.err() {
565                    err.set_data(self.current_str().to_string());
566                    return Err(err);
567                }
568
569                token.data = self.current_str();
570
571                Ok(token)
572            }
573        }
574    }
575
576    fn unterminated_spread_operator(&mut self, token: &Token<'a>) -> Result<Token<'a>, Error> {
577        let data = if self.is_pending() {
578            self.prev_str()
579        } else {
580            self.current_str()
581        };
582
583        Err(Error::with_loc(
584            "Unterminated spread operator",
585            data.to_string(),
586            token.index,
587        ))
588    }
589
590    fn done(&mut self, token: Token<'a>) -> Result<Token<'a>, Error> {
591        if let Some(mut err) = self.err() {
592            err.set_data(token.data.to_string());
593            err.index = token.index;
594            self.err = None;
595            return Err(err);
596        }
597        Ok(token)
598    }
599}
600
/// Returns `true` for characters the lexer folds into a whitespace token.
///
/// Ignored tokens other than comments and commas are assimilated to whitespace
/// <https://spec.graphql.org/October2021/#Ignored>
fn is_whitespace_assimilated(c: char) -> bool {
    // https://spec.graphql.org/October2021/#WhiteSpace
    let white_space = c == '\t' || c == ' ';
    // https://spec.graphql.org/October2021/#LineTerminator
    let line_terminator = c == '\n' || c == '\r';
    // https://spec.graphql.org/October2021/#UnicodeBOM
    let byte_order_mark = c == '\u{FEFF}';

    white_space || line_terminator || byte_order_mark
}
616
/// Returns `true` if `c` may appear after the first character of a name:
/// an ASCII letter, an ASCII digit, or `_`.
///
/// <https://spec.graphql.org/October2021/#NameContinue>
fn is_name_continue(c: char) -> bool {
    c == '_' || c.is_ascii_alphanumeric()
}
621
/// Returns `true` if `c` is a line feed (`\n`) or a carriage return (`\r`).
fn is_line_terminator(c: char) -> bool {
    c == '\n' || c == '\r'
}
625
/// Returns `true` if `c` is valid directly after a backslash in a regular
/// string as a single-character escape.
///
/// EscapedCharacter
///     "  \  /  b  f  n  r  t
fn is_escaped_char(c: char) -> bool {
    "\"\\/bfnrt".contains(c)
}
631
#[cfg(test)]
mod test {
    use super::*;

    // Lexes a schema whose string default value contains an escaped backslash
    // and dumps the result.
    // NOTE(review): this test makes no assertions; it only fails if lexing
    // panics. Consider asserting on `errors`.
    #[test]
    fn unterminated_string() {
        let schema = r#"
type Query {
    name: String
    format: String = "Y-m-d\\TH:i:sP"
}
        "#;
        let (tokens, errors) = Lexer::new(schema).lex();
        dbg!(tokens);
        dbg!(errors);
    }

    // With a limit of 10, exactly 10 tokens are produced, followed by a single
    // limit error located at the cursor index where lexing stopped.
    #[test]
    fn token_limit() {
        let lexer = Lexer::new("type Query { a a a a a a a a a }").with_limit(10);
        let (tokens, errors) = lexer.lex();
        assert_eq!(tokens.len(), 10);
        assert_eq!(
            errors,
            &[Error::limit("token limit reached, aborting lexing", 17)]
        );
    }

    // The input lexes to 26 tokens: a limit of exactly 26 must not produce an
    // error, while a limit of 25 must stop one token short.
    #[test]
    fn token_limit_exact() {
        let lexer = Lexer::new("type Query { a a a a a a a a a }").with_limit(26);
        let (tokens, errors) = lexer.lex();
        assert_eq!(tokens.len(), 26);
        assert!(errors.is_empty());

        let lexer = Lexer::new("type Query { a a a a a a a a a }").with_limit(25);
        let (tokens, errors) = lexer.lex();
        assert_eq!(tokens.len(), 25);
        assert_eq!(
            errors,
            &[Error::limit("token limit reached, aborting lexing", 31)]
        );
    }

    // The `..` in the input produces a lexing error, which uses up one slot of
    // the limit: 9 tokens + 1 error = 10, then the limit error.
    #[test]
    fn errors_and_token_limit() {
        let lexer = Lexer::new("type Query { ..a a a a a a a a a }").with_limit(10);
        let (tokens, errors) = lexer.lex();
        // Errors contribute to the token limit
        assert_eq!(tokens.len(), 9);
        assert_eq!(
            errors,
            &[
                Error::with_loc("Unterminated spread operator", "..".to_string(), 13),
                Error::limit("token limit reached, aborting lexing", 18),
            ],
        );
    }

    // Concatenating the data of every token must give back the source text
    // unchanged (the `unwrap` also checks that no error is produced).
    #[test]
    fn stream_produces_original_input() {
        let schema = r#"
type Query {
    name: String
    format: String = "Y-m-d\\TH:i:sP"
}
        "#;

        let lexer = Lexer::new(schema);
        let processed_schema = lexer
            .into_iter()
            .fold(String::new(), |acc, token| acc + token.unwrap().data());

        assert_eq!(schema, processed_schema);
    }

    // Block strings: `\"""` is an escape sequence and must not terminate the
    // string, while `\"` and `\""` are ordinary content.
    #[test]
    fn quoted_block_comment() {
        let input = r#"
"""
Not an escape character:
'/\W/'
Escape character:
\"""
\"""\"""
Not escape characters:
\" \""
Escape character followed by a quote:
\""""
"""
        "#;

        let (tokens, errors) = Lexer::new(input).lex();
        assert!(errors.is_empty());
        // The token data should be literally the source text.
        assert_eq!(
            tokens[1].data,
            r#"
"""
Not an escape character:
'/\W/'
Escape character:
\"""
\"""\"""
Not escape characters:
\" \""
Escape character followed by a quote:
\""""
"""
"#
            .trim(),
        );

        // Second case: one complete block string, then one whose only closing
        // quotes are escaped, so it runs to the end of the input.
        let input = r#"
# String contents: """
"""\""""""
# Unclosed block string
"""\"""
        "#;
        let (tokens, errors) = Lexer::new(input).lex();
        assert_eq!(tokens[3].data, r#""""\"""""""#);
        assert_eq!(
            errors,
            &[Error::with_loc(
                "unterminated string value",
                r#""""\"""
        "#
                .to_string(),
                59,
            )]
        );
    }

    // A character that cannot start any token is reported with its own text
    // and index.
    // NOTE(review): the `dbg!(tokens)` call looks like leftover debugging output.
    #[test]
    fn unexpected_character() {
        let schema = r#"
type Query {
    name: String
}
/
        "#;
        let (tokens, errors) = Lexer::new(schema).lex();
        dbg!(tokens);
        assert_eq!(
            errors,
            &[Error::with_loc(
                "Unexpected character \"/\"",
                "/".to_string(),
                33,
            )]
        );
    }
}
apollo_parser/lexer/mod.rs

apollo_parser/lexer/
mod.rs