// oxc_graphql_parser/lexer/mod.rs

1mod cursor;
2mod lookup;
3mod token;
4mod token_kind;
5
6use crate::Error;
7use crate::LimitTracker;
8use crate::lexer::cursor::Cursor;
9pub use token::Token;
10pub use token_kind::TokenKind;
11
12/// Parses GraphQL source text into tokens.
13/// ```rust
14/// use oxc_graphql_parser::Lexer;
15///
16/// let query = "
17/// {
18///     animal
19///     ...snackSelection
20///     ... on Pet {
21///       playmates {
22///         count
23///       }
24///     }
25/// }
26/// ";
27/// let (tokens, errors) = Lexer::new(query).lex();
28/// assert_eq!(errors.len(), 0);
29/// ```
30#[derive(Clone, Debug)]
31pub struct Lexer<'a> {
32    finished: bool,
33    cursor: Cursor<'a>,
34    pub(crate) limit_tracker: LimitTracker,
35}
36
/// States of the hand-written scanner in `Cursor::advance`.
///
/// Variants are grouped by the kind of token they belong to.
#[derive(Debug)]
enum State {
    /// Nothing has been read yet for the current token.
    Start,

    // --- string values ---
    /// The opening `"` was read; the next byte decides between an empty
    /// string, a block string and a regular string.
    StringLiteralStart,
    /// Inside a regular `"…"` string.
    StringLiteral,
    /// Directly after a `\` inside a regular string.
    StringLiteralBackslash,
    /// Inside a `\u` escape; the payload counts the hex digits still expected.
    StringLiteralEscapedUnicode(usize),
    /// Inside a `"""…"""` block string.
    BlockStringLiteral,
    /// Directly after a `\` inside a block string.
    BlockStringLiteralBackslash,

    // --- numbers ---
    /// A leading `-` was read; a digit must follow.
    MinusSign,
    /// The number starts with `0` (optionally preceded by `-`).
    LeadingZero,
    /// Reading the digits of the integer part.
    IntegerPart,
    /// A `.` was read; a fractional digit must follow.
    DecimalPoint,
    /// Reading the digits after the decimal point.
    FractionalPart,
    /// An `e`/`E` was read; a sign or digit must follow.
    ExponentIndicator,
    /// An exponent sign was read; a digit must follow.
    ExponentSign,
    /// Reading the digits of the exponent.
    ExponentDigit,

    // --- everything else ---
    /// After `#`, until the next line terminator or the end of input.
    Comment,
    /// The first `.` of a `...` spread was read.
    SpreadOperator,
}
57
58impl<'a> Lexer<'a> {
59    /// Create a lexer for a GraphQL source text.
60    ///
61    /// The Lexer is an iterator over tokens and errors:
62    /// ```rust
63    /// use oxc_graphql_parser::Lexer;
64    ///
65    /// let query = "# --- GraphQL here ---";
66    ///
67    /// let mut lexer = Lexer::new(query);
68    /// let mut tokens = vec![];
69    /// for token in lexer {
70    ///     match token {
71    ///         Ok(token) => tokens.push(token),
72    ///         Err(error) => panic!("{:?}", error),
73    ///     }
74    /// }
75    /// ```
76    pub fn new(input: &'a str) -> Self {
77        Self {
78            cursor: Cursor::new(input),
79            finished: false,
80            limit_tracker: LimitTracker::new(usize::MAX),
81        }
82    }
83
84    pub fn with_limit(mut self, limit: usize) -> Self {
85        self.limit_tracker = LimitTracker::new(limit);
86        self
87    }
88
89    /// Lex the full source text, consuming the lexer.
90    pub fn lex(self) -> (Vec<Token<'a>>, Vec<Error>) {
91        let mut tokens = vec![];
92        let mut errors = vec![];
93
94        for item in self {
95            match item {
96                Ok(token) => tokens.push(token),
97                Err(error) => errors.push(error),
98            }
99        }
100
101        (tokens, errors)
102    }
103}
104
105impl<'a> Iterator for Lexer<'a> {
106    type Item = Result<Token<'a>, Error>;
107
108    #[inline]
109    fn next(&mut self) -> Option<Self::Item> {
110        if self.finished {
111            return None;
112        }
113
114        if self.limit_tracker.check_and_increment() {
115            self.finished = true;
116            return Some(Err(Error::limit(
117                "token limit reached, aborting lexing",
118                self.cursor.index(),
119            )));
120        }
121
122        match self.cursor.advance() {
123            Ok(token) => {
124                if matches!(token.kind(), TokenKind::Eof) {
125                    self.finished = true;
126                }
127
128                Some(Ok(token))
129            }
130            Err(err) => Some(Err(err)),
131        }
132    }
133}
134
135impl<'a> Cursor<'a> {
136    fn advance(&mut self) -> Result<Token<'a>, Error> {
137        let mut state = State::Start;
138        let mut token = Token { kind: TokenKind::Eof, data: "", index: self.index() };
139
140        loop {
141            let Some(c) = self.bump() else {
142                return self.eof(state, token);
143            };
144            match state {
145                State::Start => {
146                    if let Some(t) = lookup::punctuation_kind(c) {
147                        token.kind = t;
148                        token.data = self.current_str();
149                        return Ok(token);
150                    }
151
152                    if lookup::is_namestart(c) {
153                        token.kind = TokenKind::Name;
154                        token.data = self.consume_name();
155                        return self.done(token);
156                    }
157
158                    if c != b'0' && c.is_ascii_digit() {
159                        token.kind = TokenKind::Int;
160                        state = State::IntegerPart;
161
162                        continue;
163                    }
164
165                    match c {
166                        b'"' => {
167                            token.kind = TokenKind::StringValue;
168                            state = State::StringLiteralStart;
169                        }
170                        b'#' => {
171                            token.kind = TokenKind::Comment;
172                            state = State::Comment;
173                        }
174                        b'.' => {
175                            token.kind = TokenKind::Spread;
176                            state = State::SpreadOperator;
177                        }
178                        b'-' => {
179                            token.kind = TokenKind::Int;
180                            state = State::MinusSign;
181                        }
182                        b'0' => {
183                            token.kind = TokenKind::Int;
184                            state = State::LeadingZero;
185                        }
186                        c if is_whitespace_assimilated(c) || (c == 0xEF && self.eat_bom()) => {
187                            token.kind = TokenKind::Whitespace;
188                            token.data = self.consume_whitespace();
189                            return self.done(token);
190                        }
191                        c => {
192                            let c = self.char_for_error(c);
193                            return Err(Error::with_loc(
194                                format!(r#"Unexpected character "{c}""#),
195                                self.current_str().to_string(),
196                                token.index,
197                            ));
198                        }
199                    };
200                }
201                State::BlockStringLiteral => match c {
202                    b'\\' => {
203                        state = State::BlockStringLiteralBackslash;
204                    }
205                    b'"'
206                        // Require two additional quotes to complete the triple quote.
207                        if self.eatc(b'"') && self.eatc(b'"') => {
208                            token.data = self.current_str();
209                            return self.done(token);
210                        }
211                    _ => {}
212                },
213                State::StringLiteralStart => match c {
214                    b'"' => {
215                        if self.eatc(b'"') {
216                            state = State::BlockStringLiteral;
217
218                            continue;
219                        }
220
221                        token.data = self.current_str();
222                        return self.done(token);
223                    }
224                    b'\\' => {
225                        state = State::StringLiteralBackslash;
226                    }
227                    _ => {
228                        state = State::StringLiteral;
229
230                        continue;
231                    }
232                },
233                State::StringLiteralEscapedUnicode(remaining) => match c {
234                    b'"' => {
235                        self.add_err(Error::with_loc(
236                            "incomplete unicode escape sequence",
237                            char::from(c).to_string(),
238                            token.index,
239                        ));
240                        token.data = self.current_str();
241                        return self.done(token);
242                    }
243                    c if !c.is_ascii_hexdigit() => {
244                        self.add_err(Error::with_loc(
245                            "invalid unicode escape sequence",
246                            c.to_string(),
247                            0,
248                        ));
249                        state = State::StringLiteral;
250
251                        continue;
252                    }
253                    _ => {
254                        if remaining <= 1 {
255                            state = State::StringLiteral;
256                            let hex_end = self.offset + 1;
257                            let hex_start = hex_end - 4;
258                            let hex = &self.source[hex_start..hex_end];
259                            // `is_ascii_hexdigit()` checks in previous iterations ensures
260                            // this `unwrap()` does not panic:
261                            let code_point = u32::from_str_radix(hex, 16).unwrap();
262                            if char::from_u32(code_point).is_none() {
263                                // TODO: https://github.com/oxc-project/oxc-graphql-parser/issues/657 needs
264                                // changes both here and in `ast/node_ext.rs`
265                                let escape_sequence_start = hex_start - 2; // include "\u"
266                                let escape_sequence = &self.source[escape_sequence_start..hex_end];
267                                self.add_err(Error::with_loc(
268                                    "surrogate code point is invalid in unicode escape sequence \
269                                     (paired surrogate not supported yet: \
270                                     https://github.com/oxc-project/oxc-graphql-parser/issues/657)",
271                                    escape_sequence.to_owned(),
272                                    0,
273                                ));
274                            }
275                            continue;
276                        }
277
278                        state = State::StringLiteralEscapedUnicode(remaining - 1)
279                    }
280                },
281                State::StringLiteral => match c {
282                    b'"' => {
283                        token.data = self.current_str();
284                        return self.done(token);
285                    }
286                    curr if is_line_terminator(curr) => {
287                        self.add_err(Error::with_loc(
288                            "unexpected line terminator",
289                            "".to_string(),
290                            0,
291                        ));
292                    }
293                    b'\\' => {
294                        state = State::StringLiteralBackslash;
295                    }
296                    _ => {}
297                },
298                State::BlockStringLiteralBackslash => match c {
299                    b'"' => {
300                        // If this is \""", we need to eat 3 in total, and then continue parsing.
301                        // The lexer does not un-escape escape sequences so it's OK
302                        // if we take this path for \"", even if that is technically not an escape
303                        // sequence.
304                        if self.eatc(b'"') {
305                            self.eatc(b'"');
306                        }
307
308                        state = State::BlockStringLiteral;
309                    }
310                    b'\\' => {
311                        // We need to stay in the backslash state:
312                        // it's legal to write \\\""" with two literal backslashes
313                        // and then the escape sequence.
314                    }
315                    _ => {
316                        state = State::BlockStringLiteral;
317                    }
318                },
319                State::StringLiteralBackslash => match c {
320                    curr if is_escaped_char(curr) => {
321                        state = State::StringLiteral;
322                    }
323                    b'u' => {
324                        state = State::StringLiteralEscapedUnicode(4);
325                    }
326                    _ => {
327                        let c = self.char_for_error(c);
328                        self.add_err(Error::with_loc(
329                            "unexpected escaped character",
330                            c.to_string(),
331                            0,
332                        ));
333
334                        state = State::StringLiteral;
335                    }
336                },
337                State::LeadingZero => match c {
338                    b'.' => {
339                        token.kind = TokenKind::Float;
340                        state = State::DecimalPoint;
341                    }
342                    b'e' | b'E' => {
343                        token.kind = TokenKind::Float;
344                        state = State::ExponentIndicator;
345                    }
346                    _ if c.is_ascii_digit() => {
347                        return Err(Error::with_loc(
348                            "Numbers must not have non-significant leading zeroes",
349                            self.current_str().to_string(),
350                            token.index,
351                        ));
352                    }
353                    _ if lookup::is_namestart(c) => {
354                        let c = char::from(c);
355                        return Err(Error::with_loc(
356                            format!("Unexpected character `{c}` as integer suffix"),
357                            self.current_str().to_string(),
358                            token.index,
359                        ));
360                    }
361                    _ => {
362                        token.data = self.prev_str();
363                        return self.done(token);
364                    }
365                },
366                State::IntegerPart => match c {
367                    curr if curr.is_ascii_digit() => {}
368                    b'.' => {
369                        token.kind = TokenKind::Float;
370                        state = State::DecimalPoint;
371                    }
372                    b'e' | b'E' => {
373                        token.kind = TokenKind::Float;
374                        state = State::ExponentIndicator;
375                    }
376                    _ if lookup::is_namestart(c) => {
377                        let c = char::from(c);
378                        return Err(Error::with_loc(
379                            format!("Unexpected character `{c}` as integer suffix"),
380                            self.current_str().to_string(),
381                            token.index,
382                        ));
383                    }
384                    _ => {
385                        token.data = self.prev_str();
386                        return self.done(token);
387                    }
388                },
389                State::DecimalPoint => match c {
390                    curr if curr.is_ascii_digit() => {
391                        state = State::FractionalPart;
392                    }
393                    _ => {
394                        let c = self.char_for_error(c);
395                        return Err(Error::with_loc(
396                            format!("Unexpected character `{c}`, expected fractional digit"),
397                            self.current_str().to_string(),
398                            token.index,
399                        ));
400                    }
401                },
402                State::FractionalPart => match c {
403                    curr if curr.is_ascii_digit() => {}
404                    b'e' | b'E' => {
405                        state = State::ExponentIndicator;
406                    }
407                    _ if c == b'.' || lookup::is_namestart(c) => {
408                        let c = char::from(c);
409                        return Err(Error::with_loc(
410                            format!("Unexpected character `{c}` as float suffix"),
411                            self.current_str().to_string(),
412                            token.index,
413                        ));
414                    }
415                    _ => {
416                        token.data = self.prev_str();
417                        return self.done(token);
418                    }
419                },
420                State::ExponentIndicator => match c {
421                    _ if c.is_ascii_digit() => {
422                        state = State::ExponentDigit;
423                    }
424                    b'+' | b'-' => {
425                        state = State::ExponentSign;
426                    }
427                    _ => {
428                        let c = self.char_for_error(c);
429                        return Err(Error::with_loc(
430                            format!("Unexpected character `{c}`, expected exponent digit or sign"),
431                            self.current_str().to_string(),
432                            token.index,
433                        ));
434                    }
435                },
436                State::ExponentSign => match c {
437                    _ if c.is_ascii_digit() => {
438                        state = State::ExponentDigit;
439                    }
440                    _ => {
441                        let c = self.char_for_error(c);
442                        return Err(Error::with_loc(
443                            format!("Unexpected character `{c}`, expected exponent digit"),
444                            self.current_str().to_string(),
445                            token.index,
446                        ));
447                    }
448                },
449                State::ExponentDigit => match c {
450                    _ if c.is_ascii_digit() => {
451                        state = State::ExponentDigit;
452                    }
453                    _ if c == b'.' || lookup::is_namestart(c) => {
454                        let c = char::from(c);
455                        return Err(Error::with_loc(
456                            format!("Unexpected character `{c}` as float suffix"),
457                            self.current_str().to_string(),
458                            token.index,
459                        ));
460                    }
461                    _ => {
462                        token.data = self.prev_str();
463                        return self.done(token);
464                    }
465                },
466                State::SpreadOperator => {
467                    if c == b'.' && self.eatc(b'.') {
468                        token.data = self.current_str();
469                        return Ok(token);
470                    }
471                    return self.unterminated_spread_operator(&token);
472                }
473                State::MinusSign => match c {
474                    b'0' => {
475                        state = State::LeadingZero;
476                    }
477                    curr if curr.is_ascii_digit() => {
478                        state = State::IntegerPart;
479                    }
480                    _ => {
481                        let c = self.char_for_error(c);
482                        return Err(Error::with_loc(
483                            format!("Unexpected character `{c}`"),
484                            self.current_str().to_string(),
485                            token.index,
486                        ));
487                    }
488                },
489                State::Comment => match c {
490                    curr if is_line_terminator(curr) => {
491                        token.data = self.prev_str();
492                        return self.done(token);
493                    }
494                    _ => {}
495                },
496            }
497        }
498    }
499
500    fn char_for_error(&mut self, c: u8) -> char {
501        if c.is_ascii() { char::from(c) } else { self.consume_current_char() }
502    }
503
504    fn eof(&mut self, state: State, mut token: Token<'a>) -> Result<Token<'a>, Error> {
505        match state {
506            State::Start => {
507                // Report EOF at the end of the input rather than one byte past it.
508                let end = self.source.len();
509                self.offset = end;
510                token.index = end;
511                Ok(token)
512            }
513            State::StringLiteralStart => {
514                let curr = self.current_str();
515
516                Err(Error::with_loc(
517                    "unexpected end of data while lexing string value",
518                    curr.to_string(),
519                    token.index,
520                ))
521            }
522            State::StringLiteral
523            | State::BlockStringLiteral
524            | State::StringLiteralEscapedUnicode(_)
525            | State::BlockStringLiteralBackslash
526            | State::StringLiteralBackslash => {
527                let curr = self.drain();
528
529                Err(Error::with_loc("unterminated string value", curr.to_string(), token.index))
530            }
531            State::SpreadOperator => self.unterminated_spread_operator(&token),
532            State::MinusSign => Err(Error::with_loc(
533                "Unexpected character \"-\"",
534                self.current_str().to_string(),
535                token.index,
536            )),
537            State::DecimalPoint | State::ExponentIndicator | State::ExponentSign => {
538                Err(Error::with_loc(
539                    "Unexpected EOF in float value",
540                    self.current_str().to_string(),
541                    token.index,
542                ))
543            }
544            State::LeadingZero
545            | State::IntegerPart
546            | State::FractionalPart
547            | State::ExponentDigit
548            | State::Comment => {
549                if let Some(mut err) = self.err.take() {
550                    err.set_data(self.current_str().to_string());
551                    return Err(err);
552                }
553
554                token.data = self.current_str();
555
556                Ok(token)
557            }
558        }
559    }
560
561    fn unterminated_spread_operator(&mut self, token: &Token<'a>) -> Result<Token<'a>, Error> {
562        let data = self.current_str();
563
564        Err(Error::with_loc("Unterminated spread operator", data.to_string(), token.index))
565    }
566
567    #[inline]
568    fn done(&mut self, token: Token<'a>) -> Result<Token<'a>, Error> {
569        if let Some(mut err) = self.err.take() {
570            err.set_data(token.data.to_string());
571            err.index = token.index;
572            return Err(err);
573        }
574        Ok(token)
575    }
576}
577
/// Returns `true` for bytes the lexer folds into a whitespace token.
///
/// Ignored tokens other than comments and commas are assimilated to whitespace:
/// <https://spec.graphql.org/October2021/#Ignored>
fn is_whitespace_assimilated(c: u8) -> bool {
    match c {
        // https://spec.graphql.org/October2021/#WhiteSpace
        b'\t' | b' ' => true,
        // https://spec.graphql.org/October2021/#LineTerminator
        b'\n' | b'\r' => true,
        _ => false,
    }
}
591
/// Returns `true` when `c` may appear after the first character of a name:
/// an ASCII letter, an ASCII digit or `_`.
///
/// <https://spec.graphql.org/October2021/#NameContinue>
fn is_name_continue(c: u8) -> bool {
    c == b'_' || c.is_ascii_alphanumeric()
}
596
/// Returns `true` when `c` is a line feed or a carriage return.
fn is_line_terminator(c: u8) -> bool {
    c == b'\n' || c == b'\r'
}
600
/// Returns `true` when `c` may directly follow a backslash in a regular
/// string, i.e. when it is one of the `EscapedCharacter` bytes:
///
/// `"`  `\`  `/`  `b`  `f`  `n`  `r`  `t`
fn is_escaped_char(c: u8) -> bool {
    b"\"\\/bfnrt".contains(&c)
}
606
#[cfg(test)]
mod test {
    use super::*;

    /// A string containing an escaped backslash is terminated by its closing
    /// quote and must lex without errors (presumably a regression test for a
    /// false "unterminated string" report, going by the test name).
    #[test]
    fn unterminated_string() {
        let schema = r#"
type Query {
    name: String
    format: String = "Y-m-d\\TH:i:sP"
}
        "#;
        let (tokens, errors) = Lexer::new(schema).lex();
        assert!(errors.is_empty(), "unexpected lexer errors: {errors:?}");
        assert!(tokens.iter().any(|token| {
            matches!(token.kind(), TokenKind::StringValue) && token.data == r#""Y-m-d\\TH:i:sP""#
        }));
    }

    /// Lexing stops with a limit error once the configured limit is reached.
    #[test]
    fn token_limit() {
        let lexer = Lexer::new("type Query { a a a a a a a a a }").with_limit(10);
        let (tokens, errors) = lexer.lex();
        assert_eq!(tokens.len(), 10);
        assert_eq!(errors, &[Error::limit("token limit reached, aborting lexing", 17)]);
    }

    /// A limit equal to the number of tokens is fine; one less is not.
    #[test]
    fn token_limit_exact() {
        let lexer = Lexer::new("type Query { a a a a a a a a a }").with_limit(26);
        let (tokens, errors) = lexer.lex();
        assert_eq!(tokens.len(), 26);
        assert!(errors.is_empty());

        let lexer = Lexer::new("type Query { a a a a a a a a a }").with_limit(25);
        let (tokens, errors) = lexer.lex();
        assert_eq!(tokens.len(), 25);
        assert_eq!(errors, &[Error::limit("token limit reached, aborting lexing", 31)]);
    }

    #[test]
    fn errors_and_token_limit() {
        let lexer = Lexer::new("type Query { ..a a a a a a a a a }").with_limit(10);
        let (tokens, errors) = lexer.lex();
        // Errors contribute to the token limit
        assert_eq!(tokens.len(), 9);
        assert_eq!(
            errors,
            &[
                Error::with_loc("Unterminated spread operator", "..".to_string(), 13),
                Error::limit("token limit reached, aborting lexing", 18),
            ],
        );
    }

    /// Concatenating the data of every token reproduces the input exactly.
    #[test]
    fn stream_produces_original_input() {
        let schema = r#"
type Query {
    name: String
    format: String = "Y-m-d\\TH:i:sP"
}
        "#;

        let mut processed_schema = String::new();
        for token in Lexer::new(schema) {
            processed_schema.push_str(token.unwrap().data());
        }

        assert_eq!(schema, processed_schema);
    }

    #[test]
    fn quoted_block_comment() {
        let input = r#"
"""
Not an escape character:
'/\W/'
Escape character:
\"""
\"""\"""
Not escape characters:
\" \""
Escape character followed by a quote:
\""""
"""
        "#;

        let (tokens, errors) = Lexer::new(input).lex();
        assert!(errors.is_empty());
        // The token data should be literally the source text.
        assert_eq!(
            tokens[1].data,
            r#"
"""
Not an escape character:
'/\W/'
Escape character:
\"""
\"""\"""
Not escape characters:
\" \""
Escape character followed by a quote:
\""""
"""
"#
            .trim(),
        );

        let input = r#"
# String contents: """
"""\""""""
# Unclosed block string
"""\"""
        "#;
        let (tokens, errors) = Lexer::new(input).lex();
        assert_eq!(tokens[3].data, r#""""\"""""""#);
        assert_eq!(
            errors,
            &[Error::with_loc(
                "unterminated string value",
                r#""""\"""
        "#
                .to_string(),
                59,
            )]
        );
    }

    #[test]
    fn unexpected_character() {
        let schema = r#"
type Query {
    name: String
}
/
        "#;
        let (_tokens, errors) = Lexer::new(schema).lex();
        assert_eq!(errors, &[Error::with_loc("Unexpected character \"/\"", "/".to_string(), 33,)]);
    }
}