1mod cursor;
2mod lookup;
3mod token;
4mod token_kind;
5
6use crate::lexer::cursor::Cursor;
7use crate::Error;
8use crate::LimitTracker;
9pub use token::Token;
10pub use token_kind::TokenKind;
11
12#[derive(Clone, Debug)]
31pub struct Lexer<'a> {
32 finished: bool,
33 cursor: Cursor<'a>,
34 pub(crate) limit_tracker: LimitTracker,
35}
36
/// States of the hand-written state machine in `Cursor::advance`.
///
/// Each call to `advance` begins in `Start` and moves between states as
/// characters are consumed, until one token (or one error) is produced.
#[derive(Debug)]
enum State {
    /// Nothing consumed yet for the current token.
    Start,

    // --- trivia ---
    /// Inside a run of ignored whitespace characters.
    Whitespace,
    /// Inside a `#` comment, up to the next line terminator.
    Comment,

    // --- names and punctuation that needs lookahead ---
    /// Inside a name, after its first character.
    Ident,
    /// After the first `.` of a potential `...`.
    SpreadOperator,

    // --- string values ---
    /// Immediately after an opening `"`; it is not yet known whether this is
    /// an empty string, a regular string, or the start of a `"""` block string.
    StringLiteralStart,
    /// Inside a regular `"…"` string.
    StringLiteral,
    /// After a `\` inside a regular string.
    StringLiteralBackslash,
    /// Inside a `\uXXXX` escape; the payload is the number of hex digits
    /// still expected.
    StringLiteralEscapedUnicode(usize),
    /// Inside a `"""…"""` block string.
    BlockStringLiteral,
    /// After a `\` inside a block string.
    BlockStringLiteralBackslash,

    // --- numbers ---
    /// After a leading `-`; a digit must follow.
    MinusSign,
    /// After a leading `0`; no further digit may follow.
    LeadingZero,
    /// Inside the integer digits of a number that starts with a non-zero digit.
    IntegerPart,
    /// Immediately after the `.` of a float; a digit must follow.
    DecimalPoint,
    /// Inside the digits following the decimal point.
    FractionalPart,
    /// Immediately after `e` / `E`; a sign or digit must follow.
    ExponentIndicator,
    /// Immediately after the exponent's `+` / `-`; a digit must follow.
    ExponentSign,
    /// Inside the exponent digits.
    ExponentDigit,
}
59
60impl<'a> Lexer<'a> {
61 pub fn new(input: &'a str) -> Self {
79 Self {
80 cursor: Cursor::new(input),
81 finished: false,
82 limit_tracker: LimitTracker::new(usize::MAX),
83 }
84 }
85
86 pub fn with_limit(mut self, limit: usize) -> Self {
87 self.limit_tracker = LimitTracker::new(limit);
88 self
89 }
90
91 pub fn lex(self) -> (Vec<Token<'a>>, Vec<Error>) {
93 let mut tokens = vec![];
94 let mut errors = vec![];
95
96 for item in self {
97 match item {
98 Ok(token) => tokens.push(token),
99 Err(error) => errors.push(error),
100 }
101 }
102
103 (tokens, errors)
104 }
105}
106
107impl<'a> Iterator for Lexer<'a> {
108 type Item = Result<Token<'a>, Error>;
109
110 fn next(&mut self) -> Option<Self::Item> {
111 if self.finished {
112 return None;
113 }
114
115 if self.limit_tracker.check_and_increment() {
116 self.finished = true;
117 return Some(Err(Error::limit(
118 "token limit reached, aborting lexing",
119 self.cursor.index(),
120 )));
121 }
122
123 match self.cursor.advance() {
124 Ok(token) => {
125 if matches!(token.kind(), TokenKind::Eof) {
126 self.finished = true;
127 }
128
129 Some(Ok(token))
130 }
131 Err(err) => Some(Err(err)),
132 }
133 }
134}
135
136impl<'a> Cursor<'a> {
137 fn advance(&mut self) -> Result<Token<'a>, Error> {
138 let mut state = State::Start;
139 let mut token = Token {
140 kind: TokenKind::Eof,
141 data: "",
142 index: self.index(),
143 };
144
145 loop {
146 let Some(c) = self.bump() else {
147 return self.eof(state, token);
148 };
149 match state {
150 State::Start => {
151 if let Some(t) = lookup::punctuation_kind(c) {
152 token.kind = t;
153 token.data = self.current_str();
154 return Ok(token);
155 }
156
157 if lookup::is_namestart(c) {
158 token.kind = TokenKind::Name;
159 state = State::Ident;
160
161 continue;
162 }
163
164 if c != '0' && c.is_ascii_digit() {
165 token.kind = TokenKind::Int;
166 state = State::IntegerPart;
167
168 continue;
169 }
170
171 match c {
172 '"' => {
173 token.kind = TokenKind::StringValue;
174 state = State::StringLiteralStart;
175 }
176 '#' => {
177 token.kind = TokenKind::Comment;
178 state = State::Comment;
179 }
180 '.' => {
181 token.kind = TokenKind::Spread;
182 state = State::SpreadOperator;
183 }
184 '-' => {
185 token.kind = TokenKind::Int;
186 state = State::MinusSign;
187 }
188 '0' => {
189 token.kind = TokenKind::Int;
190 state = State::LeadingZero;
191 }
192 c if is_whitespace_assimilated(c) => {
193 token.kind = TokenKind::Whitespace;
194 state = State::Whitespace;
195 }
196 c => {
197 return Err(Error::with_loc(
198 format!(r#"Unexpected character "{c}""#),
199 self.current_str().to_string(),
200 token.index,
201 ));
202 }
203 };
204 }
205 State::Ident => match c {
206 curr if is_name_continue(curr) => {}
207 _ => {
208 token.data = self.prev_str();
209 return self.done(token);
210 }
211 },
212 State::Whitespace => match c {
213 curr if is_whitespace_assimilated(curr) => {}
214 _ => {
215 token.data = self.prev_str();
216 return self.done(token);
217 }
218 },
219 State::BlockStringLiteral => match c {
220 '\\' => {
221 state = State::BlockStringLiteralBackslash;
222 }
223 '"'
224 if self.eatc('"') && self.eatc('"') => {
226 token.data = self.current_str();
227 return self.done(token);
228 }
229 _ => {}
230 },
231 State::StringLiteralStart => match c {
232 '"' => {
233 if self.eatc('"') {
234 state = State::BlockStringLiteral;
235
236 continue;
237 }
238
239 if self.is_pending() {
240 token.data = self.prev_str();
241 } else {
242 token.data = self.current_str();
243 }
244 return self.done(token);
245 }
246 '\\' => {
247 state = State::StringLiteralBackslash;
248 }
249 _ => {
250 state = State::StringLiteral;
251
252 continue;
253 }
254 },
255 State::StringLiteralEscapedUnicode(remaining) => match c {
256 '"' => {
257 self.add_err(Error::with_loc(
258 "incomplete unicode escape sequence",
259 c.to_string(),
260 token.index,
261 ));
262 token.data = self.current_str();
263 return self.done(token);
264 }
265 c if !c.is_ascii_hexdigit() => {
266 self.add_err(Error::with_loc(
267 "invalid unicode escape sequence",
268 c.to_string(),
269 0,
270 ));
271 state = State::StringLiteral;
272
273 continue;
274 }
275 _ => {
276 if remaining <= 1 {
277 state = State::StringLiteral;
278 let hex_end = self.offset + 1;
279 let hex_start = hex_end - 4;
280 let hex = &self.source[hex_start..hex_end];
281 let code_point = u32::from_str_radix(hex, 16).unwrap();
284 if char::from_u32(code_point).is_none() {
285 let escape_sequence_start = hex_start - 2; let escape_sequence = &self.source[escape_sequence_start..hex_end];
289 self.add_err(Error::with_loc(
290 "surrogate code point is invalid in unicode escape sequence \
291 (paired surrogate not supported yet: \
292 https://github.com/apollographql/apollo-rs/issues/657)",
293 escape_sequence.to_owned(),
294 0,
295 ));
296 }
297 continue;
298 }
299
300 state = State::StringLiteralEscapedUnicode(remaining - 1)
301 }
302 },
303 State::StringLiteral => match c {
304 '"' => {
305 token.data = self.current_str();
306 return self.done(token);
307 }
308 curr if is_line_terminator(curr) => {
309 self.add_err(Error::with_loc(
310 "unexpected line terminator",
311 "".to_string(),
312 0,
313 ));
314 }
315 '\\' => {
316 state = State::StringLiteralBackslash;
317 }
318 _ => {}
319 },
320 State::BlockStringLiteralBackslash => match c {
321 '"' => {
322 if self.eatc('"') {
327 self.eatc('"');
328 }
329
330 state = State::BlockStringLiteral;
331 }
332 '\\' => {
333 }
337 _ => {
338 state = State::BlockStringLiteral;
339 }
340 },
341 State::StringLiteralBackslash => match c {
342 curr if is_escaped_char(curr) => {
343 state = State::StringLiteral;
344 }
345 'u' => {
346 state = State::StringLiteralEscapedUnicode(4);
347 }
348 _ => {
349 self.add_err(Error::with_loc(
350 "unexpected escaped character",
351 c.to_string(),
352 0,
353 ));
354
355 state = State::StringLiteral;
356 }
357 },
358 State::LeadingZero => match c {
359 '.' => {
360 token.kind = TokenKind::Float;
361 state = State::DecimalPoint;
362 }
363 'e' | 'E' => {
364 token.kind = TokenKind::Float;
365 state = State::ExponentIndicator;
366 }
367 _ if c.is_ascii_digit() => {
368 return Err(Error::with_loc(
369 "Numbers must not have non-significant leading zeroes",
370 self.current_str().to_string(),
371 token.index,
372 ));
373 }
374 _ if lookup::is_namestart(c) => {
375 return Err(Error::with_loc(
376 format!("Unexpected character `{c}` as integer suffix"),
377 self.current_str().to_string(),
378 token.index,
379 ));
380 }
381 _ => {
382 token.data = self.prev_str();
383 return self.done(token);
384 }
385 },
386 State::IntegerPart => match c {
387 curr if curr.is_ascii_digit() => {}
388 '.' => {
389 token.kind = TokenKind::Float;
390 state = State::DecimalPoint;
391 }
392 'e' | 'E' => {
393 token.kind = TokenKind::Float;
394 state = State::ExponentIndicator;
395 }
396 _ if lookup::is_namestart(c) => {
397 return Err(Error::with_loc(
398 format!("Unexpected character `{c}` as integer suffix"),
399 self.current_str().to_string(),
400 token.index,
401 ));
402 }
403 _ => {
404 token.data = self.prev_str();
405 return self.done(token);
406 }
407 },
408 State::DecimalPoint => match c {
409 curr if curr.is_ascii_digit() => {
410 state = State::FractionalPart;
411 }
412 _ => {
413 return Err(Error::with_loc(
414 format!("Unexpected character `{c}`, expected fractional digit"),
415 self.current_str().to_string(),
416 token.index,
417 ));
418 }
419 },
420 State::FractionalPart => match c {
421 curr if curr.is_ascii_digit() => {}
422 'e' | 'E' => {
423 state = State::ExponentIndicator;
424 }
425 _ if c == '.' || lookup::is_namestart(c) => {
426 return Err(Error::with_loc(
427 format!("Unexpected character `{c}` as float suffix"),
428 self.current_str().to_string(),
429 token.index,
430 ));
431 }
432 _ => {
433 token.data = self.prev_str();
434 return self.done(token);
435 }
436 },
437 State::ExponentIndicator => match c {
438 _ if c.is_ascii_digit() => {
439 state = State::ExponentDigit;
440 }
441 '+' | '-' => {
442 state = State::ExponentSign;
443 }
444 _ => {
445 return Err(Error::with_loc(
446 format!("Unexpected character `{c}`, expected exponent digit or sign"),
447 self.current_str().to_string(),
448 token.index,
449 ))
450 }
451 },
452 State::ExponentSign => match c {
453 _ if c.is_ascii_digit() => {
454 state = State::ExponentDigit;
455 }
456 _ => {
457 return Err(Error::with_loc(
458 format!("Unexpected character `{c}`, expected exponent digit"),
459 self.current_str().to_string(),
460 token.index,
461 ))
462 }
463 },
464 State::ExponentDigit => match c {
465 _ if c.is_ascii_digit() => {
466 state = State::ExponentDigit;
467 }
468 _ if c == '.' || lookup::is_namestart(c) => {
469 return Err(Error::with_loc(
470 format!("Unexpected character `{c}` as float suffix"),
471 self.current_str().to_string(),
472 token.index,
473 ));
474 }
475 _ => {
476 token.data = self.prev_str();
477 return self.done(token);
478 }
479 },
480 State::SpreadOperator => {
481 if c == '.' && self.eatc('.') {
482 token.data = self.current_str();
483 return Ok(token);
484 }
485 return self.unterminated_spread_operator(&token);
486 }
487 State::MinusSign => match c {
488 '0' => {
489 state = State::LeadingZero;
490 }
491 curr if curr.is_ascii_digit() => {
492 state = State::IntegerPart;
493 }
494 _ => {
495 return Err(Error::with_loc(
496 format!("Unexpected character `{c}`"),
497 self.current_str().to_string(),
498 token.index,
499 ))
500 }
501 },
502 State::Comment => match c {
503 curr if is_line_terminator(curr) => {
504 token.data = self.prev_str();
505 return self.done(token);
506 }
507 _ => {}
508 },
509 }
510 }
511 }
512
513 fn eof(&mut self, state: State, mut token: Token<'a>) -> Result<Token<'a>, Error> {
514 match state {
515 State::Start => {
516 let end = self.source.len();
518 self.offset = end;
519 token.index = end;
520 Ok(token)
521 }
522 State::StringLiteralStart => {
523 let curr = self.current_str();
524
525 Err(Error::with_loc(
526 "unexpected end of data while lexing string value",
527 curr.to_string(),
528 token.index,
529 ))
530 }
531 State::StringLiteral
532 | State::BlockStringLiteral
533 | State::StringLiteralEscapedUnicode(_)
534 | State::BlockStringLiteralBackslash
535 | State::StringLiteralBackslash => {
536 let curr = self.drain();
537
538 Err(Error::with_loc(
539 "unterminated string value",
540 curr.to_string(),
541 token.index,
542 ))
543 }
544 State::SpreadOperator => self.unterminated_spread_operator(&token),
545 State::MinusSign => Err(Error::with_loc(
546 "Unexpected character \"-\"",
547 self.current_str().to_string(),
548 token.index,
549 )),
550 State::DecimalPoint | State::ExponentIndicator | State::ExponentSign => {
551 Err(Error::with_loc(
552 "Unexpected EOF in float value",
553 self.current_str().to_string(),
554 token.index,
555 ))
556 }
557 State::Ident
558 | State::LeadingZero
559 | State::IntegerPart
560 | State::FractionalPart
561 | State::ExponentDigit
562 | State::Whitespace
563 | State::Comment => {
564 if let Some(mut err) = self.err() {
565 err.set_data(self.current_str().to_string());
566 return Err(err);
567 }
568
569 token.data = self.current_str();
570
571 Ok(token)
572 }
573 }
574 }
575
576 fn unterminated_spread_operator(&mut self, token: &Token<'a>) -> Result<Token<'a>, Error> {
577 let data = if self.is_pending() {
578 self.prev_str()
579 } else {
580 self.current_str()
581 };
582
583 Err(Error::with_loc(
584 "Unterminated spread operator",
585 data.to_string(),
586 token.index,
587 ))
588 }
589
590 fn done(&mut self, token: Token<'a>) -> Result<Token<'a>, Error> {
591 if let Some(mut err) = self.err() {
592 err.set_data(token.data.to_string());
593 err.index = token.index;
594 self.err = None;
595 return Err(err);
596 }
597 Ok(token)
598 }
599}
600
/// Returns `true` for the characters the lexer folds into a `Whitespace`
/// token: tab, space, line feed, carriage return and the byte-order mark.
fn is_whitespace_assimilated(c: char) -> bool {
    const ASSIMILATED: [char; 5] = [
        '\t',       // U+0009
        ' ',        // U+0020
        '\n',       // U+000A
        '\r',       // U+000D
        '\u{FEFF}', // byte-order mark
    ];

    ASSIMILATED.contains(&c)
}
616
/// Returns `true` if `c` may appear after the first character of a name:
/// an ASCII letter, an ASCII digit, or an underscore.
fn is_name_continue(c: char) -> bool {
    c == '_' || c.is_ascii_alphanumeric()
}
621
/// Returns `true` if `c` is a line feed or a carriage return.
fn is_line_terminator(c: char) -> bool {
    c == '\n' || c == '\r'
}
625
/// Returns `true` if `c` is one of the single characters that may follow a
/// backslash in a regular string: `"`, `\`, `/`, `b`, `f`, `n`, `r` or `t`.
///
/// The `u` of a unicode escape is deliberately not part of this set.
fn is_escaped_char(c: char) -> bool {
    const SIMPLE_ESCAPES: &str = "\"\\/bfnrt";

    SIMPLE_ESCAPES.contains(c)
}
631
#[cfg(test)]
mod test {
    use super::*;

    /// A string containing an escaped backslash (`\\`) must still be
    /// recognised as terminated by its closing quote.
    #[test]
    fn unterminated_string() {
        let schema = r#"
type Query {
    name: String
    format: String = "Y-m-d\\TH:i:sP"
}
        "#;
        let (tokens, errors) = Lexer::new(schema).lex();
        assert!(errors.is_empty(), "unexpected lexing errors: {errors:?}");
        assert!(
            tokens
                .iter()
                .any(|token| token.data() == r#""Y-m-d\\TH:i:sP""#),
            "the string value should be lexed as a single token",
        );
    }

    /// Lexing stops with a limit error once the configured number of tokens
    /// has been produced.
    #[test]
    fn token_limit() {
        let lexer = Lexer::new("type Query { a a a a a a a a a }").with_limit(10);
        let (tokens, errors) = lexer.lex();
        assert_eq!(tokens.len(), 10);
        assert_eq!(
            errors,
            &[Error::limit("token limit reached, aborting lexing", 17)]
        );
    }

    /// A limit equal to the number of tokens in the input is not an error;
    /// a limit one lower is.
    #[test]
    fn token_limit_exact() {
        let lexer = Lexer::new("type Query { a a a a a a a a a }").with_limit(26);
        let (tokens, errors) = lexer.lex();
        assert_eq!(tokens.len(), 26);
        assert!(errors.is_empty());

        let lexer = Lexer::new("type Query { a a a a a a a a a }").with_limit(25);
        let (tokens, errors) = lexer.lex();
        assert_eq!(tokens.len(), 25);
        assert_eq!(
            errors,
            &[Error::limit("token limit reached, aborting lexing", 31)]
        );
    }

    /// Lexing errors count towards the token limit.
    #[test]
    fn errors_and_token_limit() {
        let lexer = Lexer::new("type Query { ..a a a a a a a a a }").with_limit(10);
        let (tokens, errors) = lexer.lex();
        // One of the ten allowed items was the spread-operator error.
        assert_eq!(tokens.len(), 9);
        assert_eq!(
            errors,
            &[
                Error::with_loc("Unterminated spread operator", "..".to_string(), 13),
                Error::limit("token limit reached, aborting lexing", 18),
            ],
        );
    }

    /// Concatenating the data of every token reproduces the input exactly.
    #[test]
    fn stream_produces_original_input() {
        let schema = r#"
type Query {
    name: String
    format: String = "Y-m-d\\TH:i:sP"
}
        "#;

        let mut processed_schema = String::new();
        for token in Lexer::new(schema) {
            processed_schema.push_str(token.unwrap().data());
        }

        assert_eq!(schema, processed_schema);
    }

    /// Escaped triple quotes inside block strings do not terminate them.
    #[test]
    fn quoted_block_comment() {
        let input = r#"
"""
Not an escape character:
'/\W/'
Escape character:
\"""
\"""\"""
Not escape characters:
\" \""
Escape character followed by a quote:
\""""
"""
        "#;

        let (tokens, errors) = Lexer::new(input).lex();
        assert!(errors.is_empty());
        // tokens[0] is the leading whitespace; the block string follows it.
        assert_eq!(
            tokens[1].data,
            r#"
"""
Not an escape character:
'/\W/'
Escape character:
\"""
\"""\"""
Not escape characters:
\" \""
Escape character followed by a quote:
\""""
"""
"#
            .trim(),
        );

        let input = r#"
# String contents: """
"""\""""""
# Unclosed block string
"""\"""
        "#;
        let (tokens, errors) = Lexer::new(input).lex();
        assert_eq!(tokens[3].data, r#""""\"""""""#);
        assert_eq!(
            errors,
            &[Error::with_loc(
                "unterminated string value",
                r#""""\"""
        "#
                .to_string(),
                59,
            )]
        );
    }

    /// A character that cannot start any token is reported with its position.
    #[test]
    fn unexpected_character() {
        let schema = r#"
type Query {
    name: String
}
/
        "#;
        let (_tokens, errors) = Lexer::new(schema).lex();
        assert_eq!(
            errors,
            &[Error::with_loc(
                "Unexpected character \"/\"",
                "/".to_string(),
                33,
            )]
        );
    }
}