1mod cursor;
2mod lookup;
3mod token;
4mod token_kind;
5
6use crate::Error;
7use crate::LimitTracker;
8use crate::lexer::cursor::Cursor;
9pub use token::Token;
10pub use token_kind::TokenKind;
11
/// Iterator-style lexer over a borrowed input string.
///
/// Each call to `next` yields either a [`Token`] or an [`Error`]; `lex`
/// drains the iterator and returns both collections at once.
#[derive(Clone, Debug)]
pub struct Lexer<'a> {
    /// Set once the `Eof` token has been produced or the token limit was hit;
    /// afterwards `next` returns `None`.
    finished: bool,
    /// Cursor over the input that actually produces tokens (see `Cursor::advance`).
    cursor: Cursor<'a>,
    /// Counts `next` calls that reach the limit check; defaults to a limit of
    /// `usize::MAX` and can be replaced through `with_limit`.
    pub(crate) limit_tracker: LimitTracker,
}
36
/// States of the byte-driven state machine in `Cursor::advance`.
#[derive(Debug)]
enum State {
    /// Initial state of every `advance` call; the first byte picks the token kind.
    Start,
    /// Inside a `\u` escape of a string; the payload is the number of hex
    /// digits still expected (it starts at 4 and counts down).
    StringLiteralEscapedUnicode(usize),
    /// Inside a `"`-delimited string, after its first character.
    StringLiteral,
    /// Directly after the opening `"`; decides between an empty string, a
    /// block string (`"""`) and a regular string.
    StringLiteralStart,
    /// Inside a `"""`-delimited block string.
    BlockStringLiteral,
    /// Directly after a backslash inside a block string.
    BlockStringLiteralBackslash,
    /// Directly after a backslash inside a regular string.
    StringLiteralBackslash,
    /// The number started with `0` (optionally preceded by `-`).
    LeadingZero,
    /// Inside the digits of the integer part of a number.
    IntegerPart,
    /// Directly after the `.` of a float; a digit must follow.
    DecimalPoint,
    /// Inside the digits after the decimal point.
    FractionalPart,
    /// Directly after `e`/`E`; a digit or a sign must follow.
    ExponentIndicator,
    /// Directly after the `+`/`-` of an exponent; a digit must follow.
    ExponentSign,
    /// Inside the digits of an exponent.
    ExponentDigit,
    /// Inside a `#` comment, which runs until a line terminator.
    Comment,
    /// Directly after the first `.` of a `...` spread operator.
    SpreadOperator,
    /// Directly after a leading `-`; a digit must follow.
    MinusSign,
}
57
58impl<'a> Lexer<'a> {
59 pub fn new(input: &'a str) -> Self {
77 Self {
78 cursor: Cursor::new(input),
79 finished: false,
80 limit_tracker: LimitTracker::new(usize::MAX),
81 }
82 }
83
84 pub fn with_limit(mut self, limit: usize) -> Self {
85 self.limit_tracker = LimitTracker::new(limit);
86 self
87 }
88
89 pub fn lex(self) -> (Vec<Token<'a>>, Vec<Error>) {
91 let mut tokens = vec![];
92 let mut errors = vec![];
93
94 for item in self {
95 match item {
96 Ok(token) => tokens.push(token),
97 Err(error) => errors.push(error),
98 }
99 }
100
101 (tokens, errors)
102 }
103}
104
105impl<'a> Iterator for Lexer<'a> {
106 type Item = Result<Token<'a>, Error>;
107
108 #[inline]
109 fn next(&mut self) -> Option<Self::Item> {
110 if self.finished {
111 return None;
112 }
113
114 if self.limit_tracker.check_and_increment() {
115 self.finished = true;
116 return Some(Err(Error::limit(
117 "token limit reached, aborting lexing",
118 self.cursor.index(),
119 )));
120 }
121
122 match self.cursor.advance() {
123 Ok(token) => {
124 if matches!(token.kind(), TokenKind::Eof) {
125 self.finished = true;
126 }
127
128 Some(Ok(token))
129 }
130 Err(err) => Some(Err(err)),
131 }
132 }
133}
134
135impl<'a> Cursor<'a> {
136 fn advance(&mut self) -> Result<Token<'a>, Error> {
137 let mut state = State::Start;
138 let mut token = Token { kind: TokenKind::Eof, data: "", index: self.index() };
139
140 loop {
141 let Some(c) = self.bump() else {
142 return self.eof(state, token);
143 };
144 match state {
145 State::Start => {
146 if let Some(t) = lookup::punctuation_kind(c) {
147 token.kind = t;
148 token.data = self.current_str();
149 return Ok(token);
150 }
151
152 if lookup::is_namestart(c) {
153 token.kind = TokenKind::Name;
154 token.data = self.consume_name();
155 return self.done(token);
156 }
157
158 if c != b'0' && c.is_ascii_digit() {
159 token.kind = TokenKind::Int;
160 state = State::IntegerPart;
161
162 continue;
163 }
164
165 match c {
166 b'"' => {
167 token.kind = TokenKind::StringValue;
168 state = State::StringLiteralStart;
169 }
170 b'#' => {
171 token.kind = TokenKind::Comment;
172 state = State::Comment;
173 }
174 b'.' => {
175 token.kind = TokenKind::Spread;
176 state = State::SpreadOperator;
177 }
178 b'-' => {
179 token.kind = TokenKind::Int;
180 state = State::MinusSign;
181 }
182 b'0' => {
183 token.kind = TokenKind::Int;
184 state = State::LeadingZero;
185 }
186 c if is_whitespace_assimilated(c) || (c == 0xEF && self.eat_bom()) => {
187 token.kind = TokenKind::Whitespace;
188 token.data = self.consume_whitespace();
189 return self.done(token);
190 }
191 c => {
192 let c = self.char_for_error(c);
193 return Err(Error::with_loc(
194 format!(r#"Unexpected character "{c}""#),
195 self.current_str().to_string(),
196 token.index,
197 ));
198 }
199 };
200 }
201 State::BlockStringLiteral => match c {
202 b'\\' => {
203 state = State::BlockStringLiteralBackslash;
204 }
205 b'"'
206 if self.eatc(b'"') && self.eatc(b'"') => {
208 token.data = self.current_str();
209 return self.done(token);
210 }
211 _ => {}
212 },
213 State::StringLiteralStart => match c {
214 b'"' => {
215 if self.eatc(b'"') {
216 state = State::BlockStringLiteral;
217
218 continue;
219 }
220
221 token.data = self.current_str();
222 return self.done(token);
223 }
224 b'\\' => {
225 state = State::StringLiteralBackslash;
226 }
227 _ => {
228 state = State::StringLiteral;
229
230 continue;
231 }
232 },
233 State::StringLiteralEscapedUnicode(remaining) => match c {
234 b'"' => {
235 self.add_err(Error::with_loc(
236 "incomplete unicode escape sequence",
237 char::from(c).to_string(),
238 token.index,
239 ));
240 token.data = self.current_str();
241 return self.done(token);
242 }
243 c if !c.is_ascii_hexdigit() => {
244 self.add_err(Error::with_loc(
245 "invalid unicode escape sequence",
246 c.to_string(),
247 0,
248 ));
249 state = State::StringLiteral;
250
251 continue;
252 }
253 _ => {
254 if remaining <= 1 {
255 state = State::StringLiteral;
256 let hex_end = self.offset + 1;
257 let hex_start = hex_end - 4;
258 let hex = &self.source[hex_start..hex_end];
259 let code_point = u32::from_str_radix(hex, 16).unwrap();
262 if char::from_u32(code_point).is_none() {
263 let escape_sequence_start = hex_start - 2; let escape_sequence = &self.source[escape_sequence_start..hex_end];
267 self.add_err(Error::with_loc(
268 "surrogate code point is invalid in unicode escape sequence \
269 (paired surrogate not supported yet: \
270 https://github.com/oxc-project/oxc-graphql-parser/issues/657)",
271 escape_sequence.to_owned(),
272 0,
273 ));
274 }
275 continue;
276 }
277
278 state = State::StringLiteralEscapedUnicode(remaining - 1)
279 }
280 },
281 State::StringLiteral => match c {
282 b'"' => {
283 token.data = self.current_str();
284 return self.done(token);
285 }
286 curr if is_line_terminator(curr) => {
287 self.add_err(Error::with_loc(
288 "unexpected line terminator",
289 "".to_string(),
290 0,
291 ));
292 }
293 b'\\' => {
294 state = State::StringLiteralBackslash;
295 }
296 _ => {}
297 },
298 State::BlockStringLiteralBackslash => match c {
299 b'"' => {
300 if self.eatc(b'"') {
305 self.eatc(b'"');
306 }
307
308 state = State::BlockStringLiteral;
309 }
310 b'\\' => {
311 }
315 _ => {
316 state = State::BlockStringLiteral;
317 }
318 },
319 State::StringLiteralBackslash => match c {
320 curr if is_escaped_char(curr) => {
321 state = State::StringLiteral;
322 }
323 b'u' => {
324 state = State::StringLiteralEscapedUnicode(4);
325 }
326 _ => {
327 let c = self.char_for_error(c);
328 self.add_err(Error::with_loc(
329 "unexpected escaped character",
330 c.to_string(),
331 0,
332 ));
333
334 state = State::StringLiteral;
335 }
336 },
337 State::LeadingZero => match c {
338 b'.' => {
339 token.kind = TokenKind::Float;
340 state = State::DecimalPoint;
341 }
342 b'e' | b'E' => {
343 token.kind = TokenKind::Float;
344 state = State::ExponentIndicator;
345 }
346 _ if c.is_ascii_digit() => {
347 return Err(Error::with_loc(
348 "Numbers must not have non-significant leading zeroes",
349 self.current_str().to_string(),
350 token.index,
351 ));
352 }
353 _ if lookup::is_namestart(c) => {
354 let c = char::from(c);
355 return Err(Error::with_loc(
356 format!("Unexpected character `{c}` as integer suffix"),
357 self.current_str().to_string(),
358 token.index,
359 ));
360 }
361 _ => {
362 token.data = self.prev_str();
363 return self.done(token);
364 }
365 },
366 State::IntegerPart => match c {
367 curr if curr.is_ascii_digit() => {}
368 b'.' => {
369 token.kind = TokenKind::Float;
370 state = State::DecimalPoint;
371 }
372 b'e' | b'E' => {
373 token.kind = TokenKind::Float;
374 state = State::ExponentIndicator;
375 }
376 _ if lookup::is_namestart(c) => {
377 let c = char::from(c);
378 return Err(Error::with_loc(
379 format!("Unexpected character `{c}` as integer suffix"),
380 self.current_str().to_string(),
381 token.index,
382 ));
383 }
384 _ => {
385 token.data = self.prev_str();
386 return self.done(token);
387 }
388 },
389 State::DecimalPoint => match c {
390 curr if curr.is_ascii_digit() => {
391 state = State::FractionalPart;
392 }
393 _ => {
394 let c = self.char_for_error(c);
395 return Err(Error::with_loc(
396 format!("Unexpected character `{c}`, expected fractional digit"),
397 self.current_str().to_string(),
398 token.index,
399 ));
400 }
401 },
402 State::FractionalPart => match c {
403 curr if curr.is_ascii_digit() => {}
404 b'e' | b'E' => {
405 state = State::ExponentIndicator;
406 }
407 _ if c == b'.' || lookup::is_namestart(c) => {
408 let c = char::from(c);
409 return Err(Error::with_loc(
410 format!("Unexpected character `{c}` as float suffix"),
411 self.current_str().to_string(),
412 token.index,
413 ));
414 }
415 _ => {
416 token.data = self.prev_str();
417 return self.done(token);
418 }
419 },
420 State::ExponentIndicator => match c {
421 _ if c.is_ascii_digit() => {
422 state = State::ExponentDigit;
423 }
424 b'+' | b'-' => {
425 state = State::ExponentSign;
426 }
427 _ => {
428 let c = self.char_for_error(c);
429 return Err(Error::with_loc(
430 format!("Unexpected character `{c}`, expected exponent digit or sign"),
431 self.current_str().to_string(),
432 token.index,
433 ));
434 }
435 },
436 State::ExponentSign => match c {
437 _ if c.is_ascii_digit() => {
438 state = State::ExponentDigit;
439 }
440 _ => {
441 let c = self.char_for_error(c);
442 return Err(Error::with_loc(
443 format!("Unexpected character `{c}`, expected exponent digit"),
444 self.current_str().to_string(),
445 token.index,
446 ));
447 }
448 },
449 State::ExponentDigit => match c {
450 _ if c.is_ascii_digit() => {
451 state = State::ExponentDigit;
452 }
453 _ if c == b'.' || lookup::is_namestart(c) => {
454 let c = char::from(c);
455 return Err(Error::with_loc(
456 format!("Unexpected character `{c}` as float suffix"),
457 self.current_str().to_string(),
458 token.index,
459 ));
460 }
461 _ => {
462 token.data = self.prev_str();
463 return self.done(token);
464 }
465 },
466 State::SpreadOperator => {
467 if c == b'.' && self.eatc(b'.') {
468 token.data = self.current_str();
469 return Ok(token);
470 }
471 return self.unterminated_spread_operator(&token);
472 }
473 State::MinusSign => match c {
474 b'0' => {
475 state = State::LeadingZero;
476 }
477 curr if curr.is_ascii_digit() => {
478 state = State::IntegerPart;
479 }
480 _ => {
481 let c = self.char_for_error(c);
482 return Err(Error::with_loc(
483 format!("Unexpected character `{c}`"),
484 self.current_str().to_string(),
485 token.index,
486 ));
487 }
488 },
489 State::Comment => match c {
490 curr if is_line_terminator(curr) => {
491 token.data = self.prev_str();
492 return self.done(token);
493 }
494 _ => {}
495 },
496 }
497 }
498 }
499
500 fn char_for_error(&mut self, c: u8) -> char {
501 if c.is_ascii() { char::from(c) } else { self.consume_current_char() }
502 }
503
504 fn eof(&mut self, state: State, mut token: Token<'a>) -> Result<Token<'a>, Error> {
505 match state {
506 State::Start => {
507 let end = self.source.len();
509 self.offset = end;
510 token.index = end;
511 Ok(token)
512 }
513 State::StringLiteralStart => {
514 let curr = self.current_str();
515
516 Err(Error::with_loc(
517 "unexpected end of data while lexing string value",
518 curr.to_string(),
519 token.index,
520 ))
521 }
522 State::StringLiteral
523 | State::BlockStringLiteral
524 | State::StringLiteralEscapedUnicode(_)
525 | State::BlockStringLiteralBackslash
526 | State::StringLiteralBackslash => {
527 let curr = self.drain();
528
529 Err(Error::with_loc("unterminated string value", curr.to_string(), token.index))
530 }
531 State::SpreadOperator => self.unterminated_spread_operator(&token),
532 State::MinusSign => Err(Error::with_loc(
533 "Unexpected character \"-\"",
534 self.current_str().to_string(),
535 token.index,
536 )),
537 State::DecimalPoint | State::ExponentIndicator | State::ExponentSign => {
538 Err(Error::with_loc(
539 "Unexpected EOF in float value",
540 self.current_str().to_string(),
541 token.index,
542 ))
543 }
544 State::LeadingZero
545 | State::IntegerPart
546 | State::FractionalPart
547 | State::ExponentDigit
548 | State::Comment => {
549 if let Some(mut err) = self.err.take() {
550 err.set_data(self.current_str().to_string());
551 return Err(err);
552 }
553
554 token.data = self.current_str();
555
556 Ok(token)
557 }
558 }
559 }
560
561 fn unterminated_spread_operator(&mut self, token: &Token<'a>) -> Result<Token<'a>, Error> {
562 let data = self.current_str();
563
564 Err(Error::with_loc("Unterminated spread operator", data.to_string(), token.index))
565 }
566
567 #[inline]
568 fn done(&mut self, token: Token<'a>) -> Result<Token<'a>, Error> {
569 if let Some(mut err) = self.err.take() {
570 err.set_data(token.data.to_string());
571 err.index = token.index;
572 return Err(err);
573 }
574 Ok(token)
575 }
576}
577
/// Returns `true` for the bytes the lexer folds into a whitespace token:
/// horizontal tab, space, line feed and carriage return.
fn is_whitespace_assimilated(c: u8) -> bool {
    [b'\t', b' ', b'\n', b'\r'].contains(&c)
}
591
/// Returns `true` when `c` is an ASCII letter, an ASCII digit or `_`.
fn is_name_continue(c: u8) -> bool {
    c == b'_' || c.is_ascii_alphanumeric()
}
596
/// Returns `true` when `c` is a line feed or a carriage return.
fn is_line_terminator(c: u8) -> bool {
    c == b'\n' || c == b'\r'
}
600
/// Returns `true` when `c` may directly follow a backslash in a regular
/// string: one of `"`, `\`, `/`, `b`, `f`, `n`, `r`, `t`.
fn is_escaped_char(c: u8) -> bool {
    b"\"\\/bfnrt".contains(&c)
}
606
#[cfg(test)]
mod test {
    use super::*;

    /// A string containing an escaped backslash must lex as one terminated
    /// string value (presumably a regression test for a lexer that used to
    /// report it as unterminated).
    #[test]
    fn unterminated_string() {
        let schema = r#"
type Query {
    name: String
    format: String = "Y-m-d\\TH:i:sP"
}
        "#;
        let (tokens, errors) = Lexer::new(schema).lex();

        assert!(errors.is_empty(), "unexpected lexer errors: {errors:?}");
        assert!(tokens.iter().any(|token| {
            matches!(token.kind(), TokenKind::StringValue)
                && token.data() == r#""Y-m-d\\TH:i:sP""#
        }));
    }

    /// Lexing stops with a limit error once the configured number of tokens
    /// has been produced.
    #[test]
    fn token_limit() {
        let lexer = Lexer::new("type Query { a a a a a a a a a }").with_limit(10);
        let (tokens, errors) = lexer.lex();
        assert_eq!(tokens.len(), 10);
        assert_eq!(errors, &[Error::limit("token limit reached, aborting lexing", 17)]);
    }

    /// The input lexes to 26 tokens: a limit of 26 is fine, 25 is not.
    #[test]
    fn token_limit_exact() {
        let lexer = Lexer::new("type Query { a a a a a a a a a }").with_limit(26);
        let (tokens, errors) = lexer.lex();
        assert_eq!(tokens.len(), 26);
        assert!(errors.is_empty());

        let lexer = Lexer::new("type Query { a a a a a a a a a }").with_limit(25);
        let (tokens, errors) = lexer.lex();
        assert_eq!(tokens.len(), 25);
        assert_eq!(errors, &[Error::limit("token limit reached, aborting lexing", 31)]);
    }

    /// Lexing errors count towards the token limit as well.
    #[test]
    fn errors_and_token_limit() {
        let lexer = Lexer::new("type Query { ..a a a a a a a a a }").with_limit(10);
        let (tokens, errors) = lexer.lex();
        assert_eq!(tokens.len(), 9);
        assert_eq!(
            errors,
            &[
                Error::with_loc("Unterminated spread operator", "..".to_string(), 13),
                Error::limit("token limit reached, aborting lexing", 18),
            ],
        );
    }

    /// Concatenating the data of every token reproduces the input verbatim.
    #[test]
    fn stream_produces_original_input() {
        let schema = r#"
type Query {
    name: String
    format: String = "Y-m-d\\TH:i:sP"
}
        "#;

        let lexer = Lexer::new(schema);
        let processed_schema =
            lexer.into_iter().fold(String::new(), |acc, token| acc + token.unwrap().data());

        assert_eq!(schema, processed_schema);
    }

    /// Escaped triple quotes inside block strings must not end the string.
    #[test]
    fn quoted_block_comment() {
        let input = r#"
"""
Not an escape character:
'/\W/'
Escape character:
\"""
\"""\"""
Not escape characters:
\" \""
Escape character followed by a quote:
\""""
"""
        "#;

        let (tokens, errors) = Lexer::new(input).lex();
        assert!(errors.is_empty());
        assert_eq!(
            tokens[1].data,
            r#"
"""
Not an escape character:
'/\W/'
Escape character:
\"""
\"""\"""
Not escape characters:
\" \""
Escape character followed by a quote:
\""""
"""
"#
            .trim(),
        );

        let input = r#"
# String contents: """
"""\""""""
# Unclosed block string
"""\"""
        "#;
        let (tokens, errors) = Lexer::new(input).lex();
        assert_eq!(tokens[3].data, r#""""\"""""""#);
        // The error data is everything from the unclosed `"""` to the end of the
        // input, so its trailing whitespace must match the input's last line.
        assert_eq!(
            errors,
            &[Error::with_loc(
                "unterminated string value",
                r#""""\"""
        "#
                .to_string(),
                59,
            )]
        );
    }

    /// A stray `/` is reported with its byte offset in the input.
    #[test]
    fn unexpected_character() {
        // The expected index 33 relies on the four-space indentation below.
        let schema = r#"
type Query {
    name: String
}
/
        "#;
        let (_tokens, errors) = Lexer::new(schema).lex();
        assert_eq!(errors, &[Error::with_loc("Unexpected character \"/\"", "/".to_string(), 33)]);
    }
}