1mod cursor;
2mod lookup;
3mod token;
4mod token_kind;
5
6use crate::lexer::cursor::Cursor;
7use crate::Error;
8use crate::LimitTracker;
9pub use token::Token;
10pub use token_kind::TokenKind;
11
/// Lexer over a borrowed input string.
///
/// `Lexer` implements [`Iterator`], yielding one `Result<Token, Error>` per
/// call to `next`; [`Lexer::lex`] drains the iterator and returns the tokens
/// and errors as two separate vectors.
#[derive(Clone, Debug)]
pub struct Lexer<'a> {
    /// Set once iteration must stop (an `Eof` token was produced or the token
    /// limit was hit); `next` returns `None` from then on.
    finished: bool,
    /// Cursor over the input; `Cursor::advance` produces each token.
    cursor: Cursor<'a>,
    /// Limit tracker consulted before every token is lexed. Defaults to
    /// `usize::MAX` in `new` and is replaced by `with_limit`.
    pub(crate) limit_tracker: LimitTracker,
}
36
/// States of the hand-written state machine driven by `Cursor::advance`.
///
/// Variants are grouped by the kind of token they belong to; the declaration
/// order carries no meaning.
#[derive(Debug)]
enum State {
    /// Nothing of the current token has been classified yet.
    Start,

    // --- trivia -----------------------------------------------------------
    /// Inside a run of ignored whitespace characters.
    Whitespace,
    /// Inside a `#` comment, up to the next line terminator.
    Comment,

    // --- names and punctuation --------------------------------------------
    /// Inside a name.
    Ident,
    /// A first `.` was seen; two more are required to form `...`.
    SpreadOperator,

    // --- numbers ----------------------------------------------------------
    /// A leading `-` was seen; a digit must follow.
    MinusSign,
    /// A leading `0` was seen; no further integer digits may follow.
    LeadingZero,
    /// Inside the integer digits of a number that does not start with `0`.
    IntegerPart,
    /// A `.` was seen after the integer part; a digit must follow.
    DecimalPoint,
    /// Inside the digits after the decimal point.
    FractionalPart,
    /// An `e` / `E` was seen; a sign or a digit must follow.
    ExponentIndicator,
    /// A `+` / `-` was seen after the exponent indicator; a digit must follow.
    ExponentSign,
    /// Inside the exponent digits.
    ExponentDigit,

    // --- strings ----------------------------------------------------------
    /// The opening `"` was seen; the next character decides between an empty
    /// string, a block string, and a regular string.
    StringLiteralStart,
    /// Inside a regular `"..."` string.
    StringLiteral,
    /// A `\` was seen inside a regular string.
    StringLiteralBackslash,
    /// Inside a `\u` escape; the payload is the number of hex digits still
    /// expected (starts at 4).
    StringLiteralEscapedUnicode(usize),
    /// Inside a `"""..."""` block string.
    BlockStringLiteral,
    /// A `\` was seen inside a block string.
    BlockStringLiteralBackslash,
}
59
60impl<'a> Lexer<'a> {
61 pub fn new(input: &'a str) -> Self {
79 Self {
80 cursor: Cursor::new(input),
81 finished: false,
82 limit_tracker: LimitTracker::new(usize::MAX),
83 }
84 }
85
86 pub fn with_limit(mut self, limit: usize) -> Self {
87 self.limit_tracker = LimitTracker::new(limit);
88 self
89 }
90
91 pub fn lex(self) -> (Vec<Token<'a>>, Vec<Error>) {
93 let mut tokens = vec![];
94 let mut errors = vec![];
95
96 for item in self {
97 match item {
98 Ok(token) => tokens.push(token),
99 Err(error) => errors.push(error),
100 }
101 }
102
103 (tokens, errors)
104 }
105}
106
107impl<'a> Iterator for Lexer<'a> {
108 type Item = Result<Token<'a>, Error>;
109
110 fn next(&mut self) -> Option<Self::Item> {
111 if self.finished {
112 return None;
113 }
114
115 if self.limit_tracker.check_and_increment() {
116 self.finished = true;
117 return Some(Err(Error::limit(
118 "token limit reached, aborting lexing",
119 self.cursor.index(),
120 )));
121 }
122
123 match self.cursor.advance() {
124 Ok(token) => {
125 if matches!(token.kind(), TokenKind::Eof) {
126 self.finished = true;
127 }
128
129 Some(Ok(token))
130 }
131 Err(err) => Some(Err(err)),
132 }
133 }
134}
135
136impl<'a> Cursor<'a> {
137 fn advance(&mut self) -> Result<Token<'a>, Error> {
138 let mut state = State::Start;
139 let mut token = Token {
140 kind: TokenKind::Eof,
141 data: "",
142 index: self.index(),
143 };
144
145 loop {
146 let Some(c) = self.bump() else {
147 return self.eof(state, token);
148 };
149 match state {
150 State::Start => {
151 if let Some(t) = lookup::punctuation_kind(c) {
152 token.kind = t;
153 token.data = self.current_str();
154 return Ok(token);
155 }
156
157 if lookup::is_namestart(c) {
158 token.kind = TokenKind::Name;
159 state = State::Ident;
160
161 continue;
162 }
163
164 if c != '0' && c.is_ascii_digit() {
165 token.kind = TokenKind::Int;
166 state = State::IntegerPart;
167
168 continue;
169 }
170
171 match c {
172 '"' => {
173 token.kind = TokenKind::StringValue;
174 state = State::StringLiteralStart;
175 }
176 '#' => {
177 token.kind = TokenKind::Comment;
178 state = State::Comment;
179 }
180 '.' => {
181 token.kind = TokenKind::Spread;
182 state = State::SpreadOperator;
183 }
184 '-' => {
185 token.kind = TokenKind::Int;
186 state = State::MinusSign;
187 }
188 '0' => {
189 token.kind = TokenKind::Int;
190 state = State::LeadingZero;
191 }
192 c if is_whitespace_assimilated(c) => {
193 token.kind = TokenKind::Whitespace;
194 state = State::Whitespace;
195 }
196 c => {
197 return Err(Error::with_loc(
198 format!("Unexpected character \"{}\"", c),
199 self.current_str().to_string(),
200 token.index,
201 ));
202 }
203 };
204 }
205 State::Ident => match c {
206 curr if is_name_continue(curr) => {}
207 _ => {
208 token.data = self.prev_str();
209 return self.done(token);
210 }
211 },
212 State::Whitespace => match c {
213 curr if is_whitespace_assimilated(curr) => {}
214 _ => {
215 token.data = self.prev_str();
216 return self.done(token);
217 }
218 },
219 State::BlockStringLiteral => match c {
220 '\\' => {
221 state = State::BlockStringLiteralBackslash;
222 }
223 '"' => {
224 if self.eatc('"') && self.eatc('"') {
226 token.data = self.current_str();
227 return self.done(token);
228 }
229 }
230 _ => {}
231 },
232 State::StringLiteralStart => match c {
233 '"' => {
234 if self.eatc('"') {
235 state = State::BlockStringLiteral;
236
237 continue;
238 }
239
240 if self.is_pending() {
241 token.data = self.prev_str();
242 } else {
243 token.data = self.current_str();
244 }
245 return self.done(token);
246 }
247 '\\' => {
248 state = State::StringLiteralBackslash;
249 }
250 _ => {
251 state = State::StringLiteral;
252
253 continue;
254 }
255 },
256 State::StringLiteralEscapedUnicode(remaining) => match c {
257 '"' => {
258 self.add_err(Error::with_loc(
259 "incomplete unicode escape sequence",
260 c.to_string(),
261 token.index,
262 ));
263 token.data = self.current_str();
264 return self.done(token);
265 }
266 c if !c.is_ascii_hexdigit() => {
267 self.add_err(Error::with_loc(
268 "invalid unicode escape sequence",
269 c.to_string(),
270 0,
271 ));
272 state = State::StringLiteral;
273
274 continue;
275 }
276 _ => {
277 if remaining <= 1 {
278 state = State::StringLiteral;
279 let hex_end = self.offset + 1;
280 let hex_start = hex_end - 4;
281 let hex = &self.source[hex_start..hex_end];
282 let code_point = u32::from_str_radix(hex, 16).unwrap();
285 if char::from_u32(code_point).is_none() {
286 let escape_sequence_start = hex_start - 2; let escape_sequence = &self.source[escape_sequence_start..hex_end];
290 self.add_err(Error::with_loc(
291 "surrogate code point is invalid in unicode escape sequence \
292 (paired surrogate not supported yet: \
293 https://github.com/apollographql/apollo-rs/issues/657)",
294 escape_sequence.to_owned(),
295 0,
296 ));
297 }
298 continue;
299 }
300
301 state = State::StringLiteralEscapedUnicode(remaining - 1)
302 }
303 },
304 State::StringLiteral => match c {
305 '"' => {
306 token.data = self.current_str();
307 return self.done(token);
308 }
309 curr if is_line_terminator(curr) => {
310 self.add_err(Error::with_loc(
311 "unexpected line terminator",
312 "".to_string(),
313 0,
314 ));
315 }
316 '\\' => {
317 state = State::StringLiteralBackslash;
318 }
319 _ => {}
320 },
321 State::BlockStringLiteralBackslash => match c {
322 '"' => {
323 if self.eatc('"') {
328 self.eatc('"');
329 }
330
331 state = State::BlockStringLiteral;
332 }
333 '\\' => {
334 }
338 _ => {
339 state = State::BlockStringLiteral;
340 }
341 },
342 State::StringLiteralBackslash => match c {
343 curr if is_escaped_char(curr) => {
344 state = State::StringLiteral;
345 }
346 'u' => {
347 state = State::StringLiteralEscapedUnicode(4);
348 }
349 _ => {
350 self.add_err(Error::with_loc(
351 "unexpected escaped character",
352 c.to_string(),
353 0,
354 ));
355
356 state = State::StringLiteral;
357 }
358 },
359 State::LeadingZero => match c {
360 '.' => {
361 token.kind = TokenKind::Float;
362 state = State::DecimalPoint;
363 }
364 'e' | 'E' => {
365 token.kind = TokenKind::Float;
366 state = State::ExponentIndicator;
367 }
368 _ if c.is_ascii_digit() => {
369 return Err(Error::with_loc(
370 "Numbers must not have non-significant leading zeroes",
371 self.current_str().to_string(),
372 token.index,
373 ));
374 }
375 _ if lookup::is_namestart(c) => {
376 return Err(Error::with_loc(
377 format!("Unexpected character `{c}` as integer suffix"),
378 self.current_str().to_string(),
379 token.index,
380 ));
381 }
382 _ => {
383 token.data = self.prev_str();
384 return self.done(token);
385 }
386 },
387 State::IntegerPart => match c {
388 curr if curr.is_ascii_digit() => {}
389 '.' => {
390 token.kind = TokenKind::Float;
391 state = State::DecimalPoint;
392 }
393 'e' | 'E' => {
394 token.kind = TokenKind::Float;
395 state = State::ExponentIndicator;
396 }
397 _ if lookup::is_namestart(c) => {
398 return Err(Error::with_loc(
399 format!("Unexpected character `{c}` as integer suffix"),
400 self.current_str().to_string(),
401 token.index,
402 ));
403 }
404 _ => {
405 token.data = self.prev_str();
406 return self.done(token);
407 }
408 },
409 State::DecimalPoint => match c {
410 curr if curr.is_ascii_digit() => {
411 state = State::FractionalPart;
412 }
413 _ => {
414 return Err(Error::with_loc(
415 format!("Unexpected character `{c}`, expected fractional digit"),
416 self.current_str().to_string(),
417 token.index,
418 ));
419 }
420 },
421 State::FractionalPart => match c {
422 curr if curr.is_ascii_digit() => {}
423 'e' | 'E' => {
424 state = State::ExponentIndicator;
425 }
426 _ if c == '.' || lookup::is_namestart(c) => {
427 return Err(Error::with_loc(
428 format!("Unexpected character `{c}` as float suffix"),
429 self.current_str().to_string(),
430 token.index,
431 ));
432 }
433 _ => {
434 token.data = self.prev_str();
435 return self.done(token);
436 }
437 },
438 State::ExponentIndicator => match c {
439 _ if c.is_ascii_digit() => {
440 state = State::ExponentDigit;
441 }
442 '+' | '-' => {
443 state = State::ExponentSign;
444 }
445 _ => {
446 return Err(Error::with_loc(
447 format!("Unexpected character `{c}`, expected exponent digit or sign"),
448 self.current_str().to_string(),
449 token.index,
450 ))
451 }
452 },
453 State::ExponentSign => match c {
454 _ if c.is_ascii_digit() => {
455 state = State::ExponentDigit;
456 }
457 _ => {
458 return Err(Error::with_loc(
459 format!("Unexpected character `{c}`, expected exponent digit"),
460 self.current_str().to_string(),
461 token.index,
462 ))
463 }
464 },
465 State::ExponentDigit => match c {
466 _ if c.is_ascii_digit() => {
467 state = State::ExponentDigit;
468 }
469 _ if c == '.' || lookup::is_namestart(c) => {
470 return Err(Error::with_loc(
471 format!("Unexpected character `{c}` as float suffix"),
472 self.current_str().to_string(),
473 token.index,
474 ));
475 }
476 _ => {
477 token.data = self.prev_str();
478 return self.done(token);
479 }
480 },
481 State::SpreadOperator => {
482 if c == '.' && self.eatc('.') {
483 token.data = self.current_str();
484 return Ok(token);
485 }
486 return self.unterminated_spread_operator(&token);
487 }
488 State::MinusSign => match c {
489 '0' => {
490 state = State::LeadingZero;
491 }
492 curr if curr.is_ascii_digit() => {
493 state = State::IntegerPart;
494 }
495 _ => {
496 return Err(Error::with_loc(
497 format!("Unexpected character `{c}`"),
498 self.current_str().to_string(),
499 token.index,
500 ))
501 }
502 },
503 State::Comment => match c {
504 curr if is_line_terminator(curr) => {
505 token.data = self.prev_str();
506 return self.done(token);
507 }
508 _ => {}
509 },
510 }
511 }
512 }
513
514 fn eof(&mut self, state: State, mut token: Token<'a>) -> Result<Token<'a>, Error> {
515 match state {
516 State::Start => {
517 token.index += 1;
518 Ok(token)
519 }
520 State::StringLiteralStart => {
521 let curr = self.current_str();
522
523 Err(Error::with_loc(
524 "unexpected end of data while lexing string value",
525 curr.to_string(),
526 token.index,
527 ))
528 }
529 State::StringLiteral
530 | State::BlockStringLiteral
531 | State::StringLiteralEscapedUnicode(_)
532 | State::BlockStringLiteralBackslash
533 | State::StringLiteralBackslash => {
534 let curr = self.drain();
535
536 Err(Error::with_loc(
537 "unterminated string value",
538 curr.to_string(),
539 token.index,
540 ))
541 }
542 State::SpreadOperator => self.unterminated_spread_operator(&token),
543 State::MinusSign => Err(Error::with_loc(
544 "Unexpected character \"-\"",
545 self.current_str().to_string(),
546 token.index,
547 )),
548 State::DecimalPoint | State::ExponentIndicator | State::ExponentSign => {
549 Err(Error::with_loc(
550 "Unexpected EOF in float value",
551 self.current_str().to_string(),
552 token.index,
553 ))
554 }
555 State::Ident
556 | State::LeadingZero
557 | State::IntegerPart
558 | State::FractionalPart
559 | State::ExponentDigit
560 | State::Whitespace
561 | State::Comment => {
562 if let Some(mut err) = self.err() {
563 err.set_data(self.current_str().to_string());
564 return Err(err);
565 }
566
567 token.data = self.current_str();
568
569 Ok(token)
570 }
571 }
572 }
573
574 fn unterminated_spread_operator(&mut self, token: &Token<'a>) -> Result<Token<'a>, Error> {
575 let data = if self.is_pending() {
576 self.prev_str()
577 } else {
578 self.current_str()
579 };
580
581 Err(Error::with_loc(
582 "Unterminated spread operator",
583 data.to_string(),
584 token.index,
585 ))
586 }
587
588 fn done(&mut self, token: Token<'a>) -> Result<Token<'a>, Error> {
589 if let Some(mut err) = self.err() {
590 err.set_data(token.data.to_string());
591 err.index = token.index;
592 self.err = None;
593 return Err(err);
594 }
595 Ok(token)
596 }
597}
598
/// Returns `true` for the characters the lexer folds into whitespace tokens:
/// tab, space, line feed, carriage return, and the byte order mark (U+FEFF).
fn is_whitespace_assimilated(c: char) -> bool {
    const WHITESPACE_LIKE: [char; 5] = [
        '\u{0009}', // horizontal tab
        '\u{0020}', // space
        '\u{000A}', // line feed
        '\u{000D}', // carriage return
        '\u{FEFF}', // byte order mark
    ];

    WHITESPACE_LIKE.contains(&c)
}
614
/// Returns `true` if `c` may appear after the first character of a name:
/// an ASCII letter, an ASCII digit, or an underscore.
fn is_name_continue(c: char) -> bool {
    c == '_' || c.is_ascii_alphanumeric()
}
619
/// Returns `true` if `c` is a line feed or a carriage return.
fn is_line_terminator(c: char) -> bool {
    c == '\n' || c == '\r'
}
623
/// Returns `true` if `c` is one of the single characters allowed directly
/// after a backslash in a regular string: `"`, `\`, `/`, `b`, `f`, `n`, `r`,
/// or `t`. (`u`, which introduces a unicode escape, is not included.)
fn is_escaped_char(c: char) -> bool {
    const SIMPLE_ESCAPES: [char; 8] = ['"', '\\', '/', 'b', 'f', 'n', 'r', 't'];

    SIMPLE_ESCAPES.contains(&c)
}
629
#[cfg(test)]
mod test {
    use super::*;

    // Lexes a schema containing a string with an escaped backslash and dumps
    // the result with `dbg!`.
    // NOTE(review): this test makes no assertions, and despite its name the
    // string in the fixture has a closing quote — confirm the intent.
    #[test]
    fn unterminated_string() {
        let schema = r#"
type Query {
    name: String
    format: String = "Y-m-d\\TH:i:sP"
}
        "#;
        let (tokens, errors) = Lexer::new(schema).lex();
        dbg!(tokens);
        dbg!(errors);
    }

    // With a limit of 10, exactly 10 tokens are produced before the limit
    // error is reported.
    #[test]
    fn token_limit() {
        let lexer = Lexer::new("type Query { a a a a a a a a a }").with_limit(10);
        let (tokens, errors) = lexer.lex();
        assert_eq!(tokens.len(), 10);
        assert_eq!(
            errors,
            &[Error::limit("token limit reached, aborting lexing", 17)]
        );
    }

    // The same input lexes to 26 tokens, so a limit of 26 is just enough and
    // a limit of 25 stops one token short with a limit error.
    #[test]
    fn token_limit_exact() {
        let lexer = Lexer::new("type Query { a a a a a a a a a }").with_limit(26);
        let (tokens, errors) = lexer.lex();
        assert_eq!(tokens.len(), 26);
        assert!(errors.is_empty());

        let lexer = Lexer::new("type Query { a a a a a a a a a }").with_limit(25);
        let (tokens, errors) = lexer.lex();
        assert_eq!(tokens.len(), 25);
        assert_eq!(
            errors,
            &[Error::limit("token limit reached, aborting lexing", 31)]
        );
    }

    // A lexing error uses up one slot of the limit: with a limit of 10, the
    // incomplete `..` yields one error and only 9 tokens are produced before
    // the limit error.
    #[test]
    fn errors_and_token_limit() {
        let lexer = Lexer::new("type Query { ..a a a a a a a a a }").with_limit(10);
        let (tokens, errors) = lexer.lex();
        assert_eq!(tokens.len(), 9);
        assert_eq!(
            errors,
            &[
                Error::with_loc("Unterminated spread operator", "..".to_string(), 13),
                Error::limit("token limit reached, aborting lexing", 18),
            ],
        );
    }

    // Concatenating the text of every token reproduces the input exactly.
    #[test]
    fn stream_produces_original_input() {
        let schema = r#"
type Query {
    name: String
    format: String = "Y-m-d\\TH:i:sP"
}
        "#;

        let lexer = Lexer::new(schema);
        let processed_schema = lexer
            .into_iter()
            .fold(String::new(), |acc, token| acc + token.unwrap().data());

        assert_eq!(schema, processed_schema);
    }

    // Block strings: `\"""` is an escaped triple quote and does not close the
    // string, while a backslash before fewer than three quotes (or before any
    // other character) is ordinary content.
    #[test]
    fn quoted_block_comment() {
        let input = r#"
"""
Not an escape character:
'/\W/'
Escape character:
\"""
\"""\"""
Not escape characters:
\" \""
Escape character followed by a quote:
\""""
"""
        "#;

        let (tokens, errors) = Lexer::new(input).lex();
        assert!(errors.is_empty());
        assert_eq!(
            // tokens[0] is the leading whitespace; tokens[1] is the block
            // string itself.
            tokens[1].data,
            r#"
"""
Not an escape character:
'/\W/'
Escape character:
\"""
\"""\"""
Not escape characters:
\" \""
Escape character followed by a quote:
\""""
"""
"#
            .trim(),
        );

        let input = r#"
# String contents: """
"""\""""""
# Unclosed block string
"""\"""
        "#;
        let (tokens, errors) = Lexer::new(input).lex();
        assert_eq!(tokens[3].data, r#""""\"""""""#);
        // The second block string only contains an escaped triple quote, so
        // it runs to the end of the input and is reported as unterminated.
        assert_eq!(
            errors,
            &[Error::with_loc(
                "unterminated string value",
                r#""""\"""
        "#
                .to_string(),
                59,
            )]
        );
    }

    // A character that cannot start any token is reported with its own text
    // and its offset in the input.
    #[test]
    fn unexpected_character() {
        let schema = r#"
type Query {
    name: String
}
/
        "#;
        let (tokens, errors) = Lexer::new(schema).lex();
        dbg!(tokens);
        assert_eq!(
            errors,
            &[Error::with_loc(
                "Unexpected character \"/\"",
                "/".to_string(),
                33,
            )]
        );
    }
}