1mod cursor;
2mod lookup;
3mod token;
4mod token_kind;
5
6use crate::Error;
7use crate::LimitTracker;
8use crate::lexer::cursor::Cursor;
9use crate::lexer::lookup::ByteClass;
10pub use token::Token;
11pub use token_kind::TokenKind;
12
13#[derive(Clone, Debug)]
32pub struct Lexer<'a> {
33 finished: bool,
34 cursor: Cursor<'a>,
35 pub(crate) limit_tracker: LimitTracker,
36}
37
/// Position inside a numeric literal, named after the piece that was consumed last.
///
/// Used by the number lexer as its state-machine state.
#[derive(Clone, Copy, Debug)]
enum NumberState {
    /// A leading `-` has been read.
    MinusSign,
    /// A leading `0` has been read.
    LeadingZero,
    /// One or more digits of the integer part have been read.
    IntegerPart,
    /// The `.` separating integer and fractional parts has been read.
    DecimalPoint,
    /// One or more digits after the decimal point have been read.
    FractionalPart,
    /// The `e` / `E` exponent marker has been read.
    ExponentIndicator,
    /// The `+` / `-` following the exponent marker has been read.
    ExponentSign,
    /// One or more exponent digits have been read.
    ExponentDigit,
}
50
51impl<'a> Lexer<'a> {
52 pub fn new(input: &'a str) -> Self {
70 Self {
71 cursor: Cursor::new(input),
72 finished: false,
73 limit_tracker: LimitTracker::new(usize::MAX),
74 }
75 }
76
77 pub fn with_limit(mut self, limit: usize) -> Self {
78 self.limit_tracker = LimitTracker::new(limit);
79 self
80 }
81
82 pub fn lex(self) -> (Vec<Token<'a>>, Vec<Error>) {
84 let mut tokens = vec![];
85 let mut errors = vec![];
86
87 for item in self {
88 match item {
89 Ok(token) => tokens.push(token),
90 Err(error) => errors.push(error),
91 }
92 }
93
94 (tokens, errors)
95 }
96
97 pub(crate) fn next_significant(&mut self) -> Option<Result<Token<'a>, Error>> {
104 if self.finished {
105 return None;
106 }
107
108 loop {
109 if self.limit_tracker.check_and_increment() {
110 self.finished = true;
111 return Some(Err(Error::limit(
112 "token limit reached, aborting lexing",
113 self.cursor.index(),
114 )));
115 }
116
117 if self.cursor.skip_trivia() {
118 continue;
119 }
120
121 return match self.cursor.advance() {
122 Ok(token) => {
123 if matches!(token.kind(), TokenKind::Eof) {
124 self.finished = true;
125 }
126
127 Some(Ok(token))
128 }
129 Err(err) => Some(Err(err)),
130 };
131 }
132 }
133}
134
135impl<'a> Iterator for Lexer<'a> {
136 type Item = Result<Token<'a>, Error>;
137
138 #[inline]
139 fn next(&mut self) -> Option<Self::Item> {
140 if self.finished {
141 return None;
142 }
143
144 if self.limit_tracker.check_and_increment() {
145 self.finished = true;
146 return Some(Err(Error::limit(
147 "token limit reached, aborting lexing",
148 self.cursor.index(),
149 )));
150 }
151
152 match self.cursor.advance() {
153 Ok(token) => {
154 if matches!(token.kind(), TokenKind::Eof) {
155 self.finished = true;
156 }
157
158 Some(Ok(token))
159 }
160 Err(err) => Some(Err(err)),
161 }
162 }
163}
164
165impl<'a> Cursor<'a> {
166 fn advance(&mut self) -> Result<Token<'a>, Error> {
167 debug_assert!(self.err.is_none());
170
171 let mut token = Token { kind: TokenKind::Eof, data: "", index: self.index() };
172
173 let Some(c) = self.bump() else {
174 let end = self.source.len();
176 self.offset = end;
177 token.index = end;
178 return Ok(token);
179 };
180
181 match lookup::byte_class(c) {
182 ByteClass::Bang => self.punctuation(token, TokenKind::Bang),
183 ByteClass::Dollar => self.punctuation(token, TokenKind::Dollar),
184 ByteClass::Amp => self.punctuation(token, TokenKind::Amp),
185 ByteClass::LParen => self.punctuation(token, TokenKind::LParen),
186 ByteClass::RParen => self.punctuation(token, TokenKind::RParen),
187 ByteClass::Comma => self.punctuation(token, TokenKind::Comma),
188 ByteClass::Colon => self.punctuation(token, TokenKind::Colon),
189 ByteClass::Eq => self.punctuation(token, TokenKind::Eq),
190 ByteClass::At => self.punctuation(token, TokenKind::At),
191 ByteClass::LBracket => self.punctuation(token, TokenKind::LBracket),
192 ByteClass::RBracket => self.punctuation(token, TokenKind::RBracket),
193 ByteClass::LCurly => self.punctuation(token, TokenKind::LCurly),
194 ByteClass::RCurly => self.punctuation(token, TokenKind::RCurly),
195 ByteClass::Pipe => self.punctuation(token, TokenKind::Pipe),
196 ByteClass::Name => {
197 token.kind = TokenKind::Name;
198 token.data = self.consume_name();
199 Ok(token)
200 }
201 ByteClass::Whitespace => {
202 token.kind = TokenKind::Whitespace;
203 token.data = self.consume_whitespace();
204 Ok(token)
205 }
206 ByteClass::Bom => {
207 if self.eat_bom() {
208 token.kind = TokenKind::Whitespace;
209 token.data = self.consume_whitespace();
210 Ok(token)
211 } else {
212 self.unexpected_character(c, &token)
213 }
214 }
215 ByteClass::Quote => self.lex_string_start(token),
216 ByteClass::Hash => self.lex_comment(token),
217 ByteClass::Dot => self.lex_spread(token),
218 ByteClass::Zero => self.lex_number(NumberState::LeadingZero, token),
219 ByteClass::Digit => self.lex_number(NumberState::IntegerPart, token),
220 ByteClass::Minus => self.lex_number(NumberState::MinusSign, token),
221 ByteClass::Other => self.unexpected_character(c, &token),
222 }
223 }
224
225 fn skip_trivia(&mut self) -> bool {
229 let Some(&c) = self.bytes.get(self.next) else {
230 return false;
231 };
232 match lookup::byte_class(c) {
233 ByteClass::Whitespace => {
234 self.bump();
235 self.consume_whitespace();
236 true
237 }
238 ByteClass::Comma => {
239 self.bump();
240 let _ = self.current_str();
242 true
243 }
244 ByteClass::Bom if self.at_bom() => {
245 self.bump();
246 self.eat_bom();
247 self.consume_whitespace();
248 true
249 }
250 _ => false,
251 }
252 }
253
254 #[inline]
255 fn punctuation(&mut self, mut token: Token<'a>, kind: TokenKind) -> Result<Token<'a>, Error> {
256 token.kind = kind;
257 token.data = self.current_str();
258 Ok(token)
259 }
260
261 fn lex_comment(&mut self, mut token: Token<'a>) -> Result<Token<'a>, Error> {
262 token.kind = TokenKind::Comment;
263 let start = self.index;
264 let end = self.seek_line_end();
265 token.data = &self.source[start..end];
266 Ok(token)
267 }
268
269 fn lex_spread(&mut self, mut token: Token<'a>) -> Result<Token<'a>, Error> {
270 token.kind = TokenKind::Spread;
271 if let Some(c) = self.bump() {
272 if c == b'.' {
273 if self.eatc(b'.') {
274 token.data = self.current_str();
275 return Ok(token);
276 }
277 } else if !c.is_ascii() {
278 self.consume_current_char();
281 }
282 }
283 let data = self.current_str();
284 Err(Error::with_loc("Unterminated spread operator", data.to_string(), token.index))
285 }
286
287 fn lex_string_start(&mut self, mut token: Token<'a>) -> Result<Token<'a>, Error> {
288 token.kind = TokenKind::StringValue;
289
290 if self.eatc(b'"') {
291 if self.eatc(b'"') {
292 return self.lex_block_string(token);
293 }
294
295 token.data = self.current_str();
297 return Ok(token);
298 }
299
300 if self.next == self.bytes.len() {
301 return Err(Error::with_loc(
303 "unexpected end of data while lexing string value",
304 self.current_str().to_string(),
305 token.index,
306 ));
307 }
308
309 self.lex_string(token)
310 }
311
312 fn lex_string(&mut self, mut token: Token<'a>) -> Result<Token<'a>, Error> {
313 loop {
314 let Some(found) = memchr::memchr2(b'"', b'\\', &self.bytes[self.next..]) else {
315 return self.unterminated_string(&token);
316 };
317 let stop = self.next + found;
318
319 if memchr::memchr2(b'\n', b'\r', &self.bytes[self.next..stop]).is_some() {
320 self.add_err(Error::with_loc("unexpected line terminator", String::new(), 0));
321 }
322
323 self.offset = stop;
325 self.next = stop + 1;
326
327 if self.bytes[stop] == b'"' {
328 token.data = self.current_str();
329 return self.done(token);
330 }
331
332 let Some(c) = self.bump() else {
334 return self.unterminated_string(&token);
335 };
336 if c == b'u' {
337 for remaining in (1..=4usize).rev() {
340 let Some(c) = self.bump() else {
341 return self.unterminated_string(&token);
342 };
343 if c == b'"' {
344 self.add_err(Error::with_loc(
345 "incomplete unicode escape sequence",
346 char::from(c).to_string(),
347 token.index,
348 ));
349 token.data = self.current_str();
350 return self.done(token);
351 }
352 if !c.is_ascii_hexdigit() {
353 self.add_err(Error::with_loc(
354 "invalid unicode escape sequence",
355 c.to_string(),
356 0,
357 ));
358 break;
359 }
360 if remaining == 1 {
361 let hex_end = self.offset + 1;
362 let hex_start = hex_end - 4;
363 let hex = &self.source[hex_start..hex_end];
364 let code_point = u32::from_str_radix(hex, 16).unwrap();
367 if char::from_u32(code_point).is_none() {
368 let escape_sequence_start = hex_start - 2; let escape_sequence = &self.source[escape_sequence_start..hex_end];
372 self.add_err(Error::with_loc(
373 "surrogate code point is invalid in unicode escape sequence \
374 (paired surrogate not supported yet: \
375 https://github.com/oxc-project/oxc-graphql-parser/issues/657)",
376 escape_sequence.to_owned(),
377 0,
378 ));
379 }
380 }
381 }
382 } else if !is_escaped_char(c) {
383 let c = self.char_for_error(c);
384 self.add_err(Error::with_loc("unexpected escaped character", c.to_string(), 0));
385 }
386 }
387 }
388
389 fn lex_block_string(&mut self, mut token: Token<'a>) -> Result<Token<'a>, Error> {
390 loop {
391 let Some(found) = memchr::memchr2(b'"', b'\\', &self.bytes[self.next..]) else {
392 return self.unterminated_string(&token);
393 };
394 let stop = self.next + found;
395
396 self.offset = stop;
398 self.next = stop + 1;
399
400 if self.bytes[stop] == b'"' {
401 if self.eatc(b'"') && self.eatc(b'"') {
404 token.data = self.current_str();
405 return self.done(token);
406 }
407 continue;
408 }
409
410 loop {
416 let Some(c) = self.bump() else {
417 return self.unterminated_string(&token);
418 };
419 match c {
420 b'\\' => {}
421 b'"' => {
422 if self.eatc(b'"') {
423 self.eatc(b'"');
424 }
425 break;
426 }
427 _ => break,
428 }
429 }
430 }
431 }
432
433 fn lex_number(
434 &mut self,
435 mut state: NumberState,
436 mut token: Token<'a>,
437 ) -> Result<Token<'a>, Error> {
438 token.kind = TokenKind::Int;
439
440 loop {
441 let Some(c) = self.bump() else {
442 return match state {
443 NumberState::MinusSign => Err(Error::with_loc(
444 "Unexpected character \"-\"",
445 self.current_str().to_string(),
446 token.index,
447 )),
448 NumberState::DecimalPoint
449 | NumberState::ExponentIndicator
450 | NumberState::ExponentSign => Err(Error::with_loc(
451 "Unexpected EOF in float value",
452 self.current_str().to_string(),
453 token.index,
454 )),
455 NumberState::LeadingZero
456 | NumberState::IntegerPart
457 | NumberState::FractionalPart
458 | NumberState::ExponentDigit => {
459 token.data = self.current_str();
460 Ok(token)
461 }
462 };
463 };
464
465 match state {
466 NumberState::MinusSign => match c {
467 b'0' => {
468 state = NumberState::LeadingZero;
469 }
470 curr if curr.is_ascii_digit() => {
471 state = NumberState::IntegerPart;
472 }
473 _ => {
474 let c = self.char_for_error(c);
475 return Err(Error::with_loc(
476 format!("Unexpected character `{c}`"),
477 self.current_str().to_string(),
478 token.index,
479 ));
480 }
481 },
482 NumberState::LeadingZero => match c {
483 b'.' => {
484 token.kind = TokenKind::Float;
485 state = NumberState::DecimalPoint;
486 }
487 b'e' | b'E' => {
488 token.kind = TokenKind::Float;
489 state = NumberState::ExponentIndicator;
490 }
491 _ if c.is_ascii_digit() => {
492 return Err(Error::with_loc(
493 "Numbers must not have non-significant leading zeroes",
494 self.current_str().to_string(),
495 token.index,
496 ));
497 }
498 _ if lookup::is_namestart(c) => {
499 let c = char::from(c);
500 return Err(Error::with_loc(
501 format!("Unexpected character `{c}` as integer suffix"),
502 self.current_str().to_string(),
503 token.index,
504 ));
505 }
506 _ => {
507 token.data = self.prev_str();
508 return Ok(token);
509 }
510 },
511 NumberState::IntegerPart => match c {
512 curr if curr.is_ascii_digit() => {}
513 b'.' => {
514 token.kind = TokenKind::Float;
515 state = NumberState::DecimalPoint;
516 }
517 b'e' | b'E' => {
518 token.kind = TokenKind::Float;
519 state = NumberState::ExponentIndicator;
520 }
521 _ if lookup::is_namestart(c) => {
522 let c = char::from(c);
523 return Err(Error::with_loc(
524 format!("Unexpected character `{c}` as integer suffix"),
525 self.current_str().to_string(),
526 token.index,
527 ));
528 }
529 _ => {
530 token.data = self.prev_str();
531 return Ok(token);
532 }
533 },
534 NumberState::DecimalPoint => match c {
535 curr if curr.is_ascii_digit() => {
536 state = NumberState::FractionalPart;
537 }
538 _ => {
539 let c = self.char_for_error(c);
540 return Err(Error::with_loc(
541 format!("Unexpected character `{c}`, expected fractional digit"),
542 self.current_str().to_string(),
543 token.index,
544 ));
545 }
546 },
547 NumberState::FractionalPart => match c {
548 curr if curr.is_ascii_digit() => {}
549 b'e' | b'E' => {
550 state = NumberState::ExponentIndicator;
551 }
552 _ if c == b'.' || lookup::is_namestart(c) => {
553 let c = char::from(c);
554 return Err(Error::with_loc(
555 format!("Unexpected character `{c}` as float suffix"),
556 self.current_str().to_string(),
557 token.index,
558 ));
559 }
560 _ => {
561 token.data = self.prev_str();
562 return Ok(token);
563 }
564 },
565 NumberState::ExponentIndicator => match c {
566 _ if c.is_ascii_digit() => {
567 state = NumberState::ExponentDigit;
568 }
569 b'+' | b'-' => {
570 state = NumberState::ExponentSign;
571 }
572 _ => {
573 let c = self.char_for_error(c);
574 return Err(Error::with_loc(
575 format!("Unexpected character `{c}`, expected exponent digit or sign"),
576 self.current_str().to_string(),
577 token.index,
578 ));
579 }
580 },
581 NumberState::ExponentSign => match c {
582 _ if c.is_ascii_digit() => {
583 state = NumberState::ExponentDigit;
584 }
585 _ => {
586 let c = self.char_for_error(c);
587 return Err(Error::with_loc(
588 format!("Unexpected character `{c}`, expected exponent digit"),
589 self.current_str().to_string(),
590 token.index,
591 ));
592 }
593 },
594 NumberState::ExponentDigit => match c {
595 _ if c.is_ascii_digit() => {}
596 _ if c == b'.' || lookup::is_namestart(c) => {
597 let c = char::from(c);
598 return Err(Error::with_loc(
599 format!("Unexpected character `{c}` as float suffix"),
600 self.current_str().to_string(),
601 token.index,
602 ));
603 }
604 _ => {
605 token.data = self.prev_str();
606 return Ok(token);
607 }
608 },
609 }
610 }
611 }
612
613 fn unexpected_character(&mut self, c: u8, token: &Token<'a>) -> Result<Token<'a>, Error> {
614 let c = self.char_for_error(c);
615 Err(Error::with_loc(
616 format!(r#"Unexpected character "{c}""#),
617 self.current_str().to_string(),
618 token.index,
619 ))
620 }
621
622 fn unterminated_string(&mut self, token: &Token<'a>) -> Result<Token<'a>, Error> {
623 self.err = None;
626 Err(Error::with_loc("unterminated string value", self.drain().to_string(), token.index))
627 }
628
629 fn char_for_error(&mut self, c: u8) -> char {
630 if c.is_ascii() { char::from(c) } else { self.consume_current_char() }
631 }
632
633 #[inline]
634 fn done(&mut self, token: Token<'a>) -> Result<Token<'a>, Error> {
635 if let Some(mut err) = self.err.take() {
636 err.set_data(token.data.to_string());
637 err.index = token.index;
638 return Err(err);
639 }
640 Ok(token)
641 }
642}
643
/// Returns `true` for tab, space, line feed and carriage return.
fn is_whitespace_assimilated(c: u8) -> bool {
    [b'\t', b' ', b'\n', b'\r'].contains(&c)
}
657
/// Returns `true` for bytes allowed after the first byte of a name:
/// ASCII letters, ASCII digits and `_`.
fn is_name_continue(c: u8) -> bool {
    c == b'_' || c.is_ascii_alphanumeric()
}
662
/// Returns `true` when `c` may follow a backslash as a single-character escape
/// in a string value: `"`, `\`, `/`, `b`, `f`, `n`, `r` or `t`.
fn is_escaped_char(c: u8) -> bool {
    b"\"\\/bfnrt".contains(&c)
}
668
#[cfg(test)]
mod test {
    use super::*;

    /// Regression test for a string containing an escaped backslash (`\\`).
    ///
    /// Despite the test's name, the string in this schema is properly terminated,
    /// so lexing must succeed without errors.
    #[test]
    fn unterminated_string() {
        let schema = r#"
type Query {
    name: String
    format: String = "Y-m-d\\TH:i:sP"
}
        "#;
        let (tokens, errors) = Lexer::new(schema).lex();
        assert!(errors.is_empty(), "unexpected lexer errors: {errors:?}");
        assert!(tokens.iter().any(|token| token.data() == r#""Y-m-d\\TH:i:sP""#));
    }

    /// Hitting the limit stops lexing and reports the position reached.
    #[test]
    fn token_limit() {
        let lexer = Lexer::new("type Query { a a a a a a a a a }").with_limit(10);
        let (tokens, errors) = lexer.lex();
        assert_eq!(tokens.len(), 10);
        assert_eq!(errors, &[Error::limit("token limit reached, aborting lexing", 17)]);
    }

    /// A limit equal to the token count passes; one less fails at the last token.
    #[test]
    fn token_limit_exact() {
        let lexer = Lexer::new("type Query { a a a a a a a a a }").with_limit(26);
        let (tokens, errors) = lexer.lex();
        assert_eq!(tokens.len(), 26);
        assert!(errors.is_empty());

        let lexer = Lexer::new("type Query { a a a a a a a a a }").with_limit(25);
        let (tokens, errors) = lexer.lex();
        assert_eq!(tokens.len(), 25);
        assert_eq!(errors, &[Error::limit("token limit reached, aborting lexing", 31)]);
    }

    /// Lexing errors count towards the limit just like tokens do.
    #[test]
    fn errors_and_token_limit() {
        let lexer = Lexer::new("type Query { ..a a a a a a a a a }").with_limit(10);
        let (tokens, errors) = lexer.lex();
        assert_eq!(tokens.len(), 9);
        assert_eq!(
            errors,
            &[
                Error::with_loc("Unterminated spread operator", "..".to_string(), 13),
                Error::limit("token limit reached, aborting lexing", 18),
            ],
        );
    }

    /// Concatenating the data of every token reproduces the input exactly.
    #[test]
    fn stream_produces_original_input() {
        let schema = r#"
type Query {
    name: String
    format: String = "Y-m-d\\TH:i:sP"
}
        "#;

        let lexer = Lexer::new(schema);
        let processed_schema =
            lexer.into_iter().fold(String::new(), |acc, token| acc + token.unwrap().data());

        assert_eq!(schema, processed_schema);
    }

    /// Block strings: `\"""` is an escape, other backslash/quote mixes are literal.
    #[test]
    fn quoted_block_comment() {
        let input = r#"
"""
Not an escape character:
'/\W/'
Escape character:
\"""
\"""\"""
Not escape characters:
\" \""
Escape character followed by a quote:
\""""
"""
        "#;

        let (tokens, errors) = Lexer::new(input).lex();
        assert!(errors.is_empty());
        assert_eq!(
            tokens[1].data,
            r#"
"""
Not an escape character:
'/\W/'
Escape character:
\"""
\"""\"""
Not escape characters:
\" \""
Escape character followed by a quote:
\""""
"""
"#
            .trim(),
        );

        let input = r#"
# String contents: """
"""\""""""
# Unclosed block string
"""\"""
        "#;
        let (tokens, errors) = Lexer::new(input).lex();
        assert_eq!(tokens[3].data, r#""""\"""""""#);
        assert_eq!(
            errors,
            &[Error::with_loc(
                "unterminated string value",
                r#""""\"""
        "#
                .to_string(),
                59,
            )]
        );
    }

    /// A byte that cannot start any token is reported with its offset.
    #[test]
    fn unexpected_character() {
        let schema = r#"
type Query {
    name: String
}
/
        "#;
        let (_tokens, errors) = Lexer::new(schema).lex();
        assert_eq!(errors, &[Error::with_loc("Unexpected character \"/\"", "/".to_string(), 33,)]);
    }

    /// The error data for a broken spread must include the whole multi-byte
    /// character that followed the dot, not just its first byte.
    #[test]
    fn spread_followed_by_multibyte_character() {
        let (tokens, errors) = Lexer::new(".\u{20AC}").lex();
        // Only the `Eof` token is produced.
        assert_eq!(tokens.len(), 1);
        assert_eq!(
            errors,
            &[Error::with_loc("Unterminated spread operator", ".\u{20AC}".to_string(), 0,)]
        );
    }
}