// libgraphql_parser/token/str_to_graphql_token_source.rs
1//! A [`GraphQLTokenSource`] that lexes from a `&str` input.
2//!
3//! This lexer implements zero-copy lexing: token values borrow directly from
4//! the source string using `Cow::Borrowed`, avoiding allocations for names,
5//! numbers, and strings.
6//!
7//! # Features
8//!
9//! - **Zero-copy lexing**: Token values borrow from source text when possible
10//! - **Dual column tracking**: Reports both UTF-8 character positions (for
//!   display) and UTF-16 code unit positions (for LSP compatibility)
12//! - **Comment preservation**: GraphQL `#` comments are captured as trivia
13//! - **Error recovery**: Invalid characters emit `Error` tokens, allowing the
//!   lexer to continue and report multiple errors
15//!
16//! # Usage
17//!
18//! ```rust
19//! use libgraphql_parser::token::StrGraphQLTokenSource;
20//!
21//! let source = "{ name }";
22//! let lexer = StrGraphQLTokenSource::new(source);
//! for token in lexer {
//!     println!("{:?}", token.kind);
//! }
26//! // Output:
27//! // CurlyBraceOpen
28//! // Name(Borrowed("name"))
29//! // CurlyBraceClose
30//! // Eof
31//! ```
32
33use crate::ByteSpan;
34use crate::GraphQLErrorNote;
35use crate::SourceMap;
36use crate::SourceSpan;
37use crate::smallvec::smallvec;
38use crate::token::GraphQLToken;
39use crate::token::GraphQLTokenKind;
40use crate::token::GraphQLTriviaToken;
41use crate::token::GraphQLTriviaTokenVec;
42use crate::token::GraphQLTokenSource;
43use crate::token::StrGraphQLTokenSourceConfig;
44use std::borrow::Cow;
45use std::path::Path;
46use std::path::PathBuf;
47
/// A [`GraphQLTokenSource`](crate::token::GraphQLTokenSource) that lexes from
/// a `&str` input.
///
/// This lexer produces [`GraphQLToken`]s with zero-copy string values where
/// possible. The `'src` lifetime ties token values to the source string.
///
/// Iterating yields one token per call, ending with a single `Eof` token,
/// after which the iterator returns `None`.
///
/// See module documentation for details.
pub struct StrGraphQLTokenSource<'src> {
    /// The full source text being lexed.
    source: &'src str,

    /// Current byte offset from the start of `source`.
    ///
    /// The remaining text to lex is `&source[curr_byte_offset..]`.
    /// Always lies on a UTF-8 character boundary (advanced by whole
    /// characters or whole ASCII sentinel runs).
    curr_byte_offset: usize,

    /// Trivia (comments, commas, whitespace) accumulated before the next
    /// token; drained into that token's `preceding_trivia` when it is
    /// emitted.
    pending_trivia: GraphQLTriviaTokenVec<'src>,

    /// Whether the EOF token has been emitted. Once set, the iterator
    /// yields `None`.
    finished: bool,

    /// Maps byte offsets to line/column positions. Built via an O(n) pre-pass
    /// in the constructor.
    source_map: SourceMap<'src>,

    /// Controls which trivia types (comments, commas, whitespace) are
    /// captured on emitted tokens.
    config: StrGraphQLTokenSourceConfig,
}
78
79impl<'src> StrGraphQLTokenSource<'src> {
80 /// Creates a new token source from a string slice.
81 ///
82 /// # Example
83 ///
84 /// ```rust
85 /// # use libgraphql_parser::token::StrGraphQLTokenSource;
86 /// let lexer = StrGraphQLTokenSource::new("{ name }");
87 /// ```
88 pub fn new(source: &'src str) -> Self {
89 Self {
90 source,
91 curr_byte_offset: 0,
92 pending_trivia: smallvec![],
93 finished: false,
94 source_map: SourceMap::new_with_source(source, None),
95 config: StrGraphQLTokenSourceConfig::default(),
96 }
97 }
98
99 /// Creates a new token source with a custom trivia configuration.
100 ///
101 /// See [`StrGraphQLTokenSourceConfig`] for available options.
102 pub fn with_config(
103 source: &'src str,
104 config: StrGraphQLTokenSourceConfig,
105 ) -> Self {
106 Self {
107 source,
108 curr_byte_offset: 0,
109 pending_trivia: smallvec![],
110 finished: false,
111 source_map: SourceMap::new_with_source(source, None),
112 config,
113 }
114 }
115
116 /// Creates a new token source with an associated file path.
117 ///
118 /// The file path is included in token spans for error reporting.
119 pub fn with_file_path(source: &'src str, path: &'src Path) -> Self {
120 Self {
121 source,
122 curr_byte_offset: 0,
123 pending_trivia: smallvec![],
124 finished: false,
125 source_map: SourceMap::new_with_source(
126 source,
127 Some(PathBuf::from(path)),
128 ),
129 config: StrGraphQLTokenSourceConfig::default(),
130 }
131 }
132
133 // =========================================================================
134 // Position and scanning helpers
135 // =========================================================================
136
137 /// Returns the remaining source text to be lexed.
138 fn remaining(&self) -> &'src str {
139 &self.source[self.curr_byte_offset..]
140 }
141
    /// Returns the current byte offset as a `u32`.
    ///
    /// NOTE(review): this is a plain `as` cast, so for sources larger
    /// than `u32::MAX` bytes the offset would silently wrap — presumably
    /// inputs are assumed to fit in 4 GiB (matching `ByteSpan`'s `u32`
    /// endpoints); confirm that assumption upstream.
    fn curr_offset(&self) -> u32 {
        self.curr_byte_offset as u32
    }
146
147 /// Peeks at the next character without consuming it.
148 ///
149 /// Returns `None` if at end of input.
150 ///
151 /// # Performance (B1 in benchmark-optimizations.md)
152 ///
153 /// This uses direct byte access with an ASCII fast path instead
154 /// of the naive `remaining().chars().next()`. GraphQL source text
155 /// is overwhelmingly ASCII (names, keywords, punctuators,
156 /// whitespace), so the fast path covers >99% of calls. The
157 /// non-ASCII fallback (Unicode in string literals/comments) is
158 /// rare and can remain slow.
159 ///
160 /// Without this optimization, every peek would construct a
161 /// `Chars` iterator and decode the first UTF-8 sequence — a
162 /// measurable cost given that peek is called millions of times
163 /// for large inputs.
164 #[inline]
165 fn peek_char(&self) -> Option<char> {
166 let bytes = self.source.as_bytes();
167 if self.curr_byte_offset >= bytes.len() {
168 return None;
169 }
170 let b = bytes[self.curr_byte_offset];
171 if b.is_ascii() {
172 // Fast path: single-byte ASCII character (covers >99%
173 // of GraphQL source text).
174 Some(b as char)
175 } else {
176 // Slow path: multi-byte UTF-8 character. Fall back to
177 // full UTF-8 decoding. This only triggers inside
178 // string literals or comments containing non-ASCII
179 // characters.
180 self.source[self.curr_byte_offset..].chars().next()
181 }
182 }
183
184 /// Peeks at the nth character ahead without consuming.
185 ///
186 /// `peek_char_nth(0)` is equivalent to `peek_char()`.
187 /// Returns `None` if there aren't enough characters remaining.
188 ///
189 /// Note: Unlike `peek_char()`, this still uses the iterator
190 /// approach since it needs to skip over variable-width UTF-8
191 /// characters to reach position n. This method is only called
192 /// in a few places for multi-character lookahead (e.g., number
193 /// parsing to check digit after `.`), so it is not a hot path.
194 fn peek_char_nth(&self, n: usize) -> Option<char> {
195 self.remaining().chars().nth(n)
196 }
197
198 /// Consumes the next character and advances the byte offset.
199 ///
200 /// Returns `None` if at end of input.
201 ///
202 /// # Performance (B1 in benchmark-optimizations.md)
203 ///
204 /// Uses an ASCII fast path: if the current byte is <0x80, we
205 /// know it is exactly 1 byte, so we avoid calling `ch.len_utf8()`.
206 /// Line/column tracking is deferred to the [`SourceMap`] (resolved
207 /// on demand), so `consume()` only updates `curr_byte_offset`.
208 fn consume(&mut self) -> Option<char> {
209 let bytes = self.source.as_bytes();
210 if self.curr_byte_offset >= bytes.len() {
211 return None;
212 }
213
214 let b = bytes[self.curr_byte_offset];
215
216 if b.is_ascii() {
217 self.curr_byte_offset += 1;
218 Some(b as char)
219 } else {
220 let ch = self.source[self.curr_byte_offset..]
221 .chars()
222 .next()
223 .unwrap();
224 self.curr_byte_offset += ch.len_utf8();
225 Some(ch)
226 }
227 }
228
229 /// Creates a [`ByteSpan`] from a start byte offset to the current
230 /// byte offset.
231 #[inline]
232 fn make_span(&self, start: u32) -> ByteSpan {
233 ByteSpan::new(start, self.curr_byte_offset as u32)
234 }
235
236 /// Resolves a `ByteSpan` to a `SourceSpan` using this token
237 /// source's `SourceMap`. Falls back to `SourceSpan::zero()` if
238 /// resolution fails.
239 fn resolve_span(&self, span: ByteSpan) -> SourceSpan {
240 self.source_map.resolve_span(span)
241 .unwrap_or_else(SourceSpan::zero)
242 }
243
244 // =========================================================================
245 // Token creation helpers
246 // =========================================================================
247
248 /// Creates a token with the accumulated trivia.
249 fn make_token(
250 &mut self,
251 kind: GraphQLTokenKind<'src>,
252 span: ByteSpan,
253 ) -> GraphQLToken<'src> {
254 GraphQLToken {
255 kind,
256 preceding_trivia: std::mem::take(&mut self.pending_trivia),
257 span,
258 }
259 }
260
261 // =========================================================================
262 // Lexer main loop
263 // =========================================================================
264
265 /// Advances to the next token, skipping whitespace and collecting trivia.
266 fn next_token(&mut self) -> GraphQLToken<'src> {
267 loop {
268 // Skip whitespace
269 self.skip_whitespace();
270
271 let start = self.curr_offset();
272
273 match self.peek_char() {
274 None => {
275 // End of input
276 let span = self.make_span(start);
277 return self.make_token(GraphQLTokenKind::Eof, span);
278 }
279
280 Some('#') => {
281 // Comment - collect as trivia and continue
282 self.lex_comment(start);
283 continue;
284 }
285
286 Some(',') => {
287 // Comma - collect as trivia and continue
288 self.consume();
289 if self.config.retain_commas {
290 let span = self.make_span(start);
291 self.pending_trivia
292 .push(GraphQLTriviaToken::Comma { span });
293 }
294 continue;
295 }
296
297 // Single-character punctuators
298 Some('!') => {
299 self.consume();
300 let span = self.make_span(start);
301 return self.make_token(GraphQLTokenKind::Bang, span);
302 }
303 Some('$') => {
304 self.consume();
305 let span = self.make_span(start);
306 return self.make_token(GraphQLTokenKind::Dollar, span);
307 }
308 Some('&') => {
309 self.consume();
310 let span = self.make_span(start);
311 return self.make_token(GraphQLTokenKind::Ampersand, span);
312 }
313 Some('(') => {
314 self.consume();
315 let span = self.make_span(start);
316 return self.make_token(GraphQLTokenKind::ParenOpen, span);
317 }
318 Some(')') => {
319 self.consume();
320 let span = self.make_span(start);
321 return self.make_token(GraphQLTokenKind::ParenClose, span);
322 }
323 Some(':') => {
324 self.consume();
325 let span = self.make_span(start);
326 return self.make_token(GraphQLTokenKind::Colon, span);
327 }
328 Some('=') => {
329 self.consume();
330 let span = self.make_span(start);
331 return self.make_token(GraphQLTokenKind::Equals, span);
332 }
333 Some('@') => {
334 self.consume();
335 let span = self.make_span(start);
336 return self.make_token(GraphQLTokenKind::At, span);
337 }
338 Some('[') => {
339 self.consume();
340 let span = self.make_span(start);
341 return self.make_token(GraphQLTokenKind::SquareBracketOpen, span);
342 }
343 Some(']') => {
344 self.consume();
345 let span = self.make_span(start);
346 return self.make_token(GraphQLTokenKind::SquareBracketClose, span);
347 }
348 Some('{') => {
349 self.consume();
350 let span = self.make_span(start);
351 return self.make_token(GraphQLTokenKind::CurlyBraceOpen, span);
352 }
353 Some('}') => {
354 self.consume();
355 let span = self.make_span(start);
356 return self.make_token(GraphQLTokenKind::CurlyBraceClose, span);
357 }
358 Some('|') => {
359 self.consume();
360 let span = self.make_span(start);
361 return self.make_token(GraphQLTokenKind::Pipe, span);
362 }
363
364 // Ellipsis or dot error
365 Some('.') => {
366 return self.lex_dot_or_ellipsis(start);
367 }
368
369 // String literals
370 Some('"') => {
371 return self.lex_string(start);
372 }
373
374 // Names and keywords
375 Some(c) if is_name_start(c) => {
376 return self.lex_name(start);
377 }
378
379 // Numbers (including negative)
380 Some(c) if c == '-' || c.is_ascii_digit() => {
381 return self.lex_number(start);
382 }
383
384 // Invalid character
385 Some(_) => {
386 return self.lex_invalid_character(start);
387 }
388 }
389 }
390 }
391
392 // =========================================================================
393 // Whitespace handling
394 // =========================================================================
395
396 /// Skips whitespace characters.
397 ///
398 /// Per the GraphQL spec, these are "ignored tokens":
399 /// - Space (U+0020)
400 /// - Tab (U+0009)
401 /// - Line terminators: LF (U+000A), CR (U+000D), CRLF
402 /// - BOM (U+FEFF) - Unicode BOM is ignored anywhere in the document
403 ///
404 /// See: <https://spec.graphql.org/September2025/#sec-Language.Source-Text.Unicode>
405 ///
406 /// Note: Comma is also whitespace in GraphQL but we handle it separately
407 /// to preserve it as trivia.
408 ///
409 /// # Performance (B2 in benchmark-optimizations.md)
410 ///
411 /// Uses byte-scanning instead of per-character `consume()`
412 /// calls, doing one branch per byte and a single
413 /// `curr_byte_offset` update at the end.
414 fn skip_whitespace(&mut self) {
415 let bytes = self.source.as_bytes();
416 let start_byte_offset = self.curr_byte_offset;
417 let retain = self.config.retain_whitespace;
418 let start = if retain {
419 Some(self.curr_offset())
420 } else {
421 None
422 };
423
424 let mut i = self.curr_byte_offset;
425
426 loop {
427 if i >= bytes.len() {
428 break;
429 }
430 match bytes[i] {
431 b' ' | b'\t' | b'\n' | b'\r' => {
432 i += 1;
433 },
434 // BOM: U+FEFF = 0xEF 0xBB 0xBF in UTF-8.
435 0xEF if i + 2 < bytes.len()
436 && bytes[i + 1] == 0xBB
437 && bytes[i + 2] == 0xBF => {
438 i += 3;
439 },
440 _ => break,
441 }
442 }
443
444 if i == self.curr_byte_offset {
445 return;
446 }
447
448 self.curr_byte_offset = i;
449
450 // Capture the whitespace run as trivia if configured.
451 if let Some(ws_start) = start {
452 let value = &self.source[start_byte_offset..i];
453 let span = self.make_span(ws_start);
454 self.pending_trivia.push(
455 GraphQLTriviaToken::Whitespace {
456 value: Cow::Borrowed(value),
457 span,
458 },
459 );
460 }
461 }
462
463 // =========================================================================
464 // Comment lexing
465 // =========================================================================
466
467 /// Lexes a comment and adds it to pending trivia.
468 ///
469 /// A comment starts with `#` and extends to the end of the line.
470 ///
471 /// # Performance (B2 in benchmark-optimizations.md)
472 ///
473 /// Uses byte-scanning to find end-of-line instead of
474 /// per-character `peek_char()` + `consume()`. Comments never
475 /// span multiple lines, so line number doesn't change — only
476 /// the column advances. Column is computed once at the end
477 /// via `compute_columns_for_span()` (with an ASCII fast path
478 /// for the common case).
479 fn lex_comment(&mut self, start: u32) {
480 // Consume the '#' (single ASCII byte).
481 self.curr_byte_offset += 1;
482
483 let content_start = self.curr_byte_offset;
484 let bytes = self.source.as_bytes();
485
486 // SIMD-accelerated scan to end of line or EOF.
487 let i = memchr::memchr2(b'\n', b'\r', &bytes[content_start..])
488 .map_or(bytes.len(), |offset| content_start + offset);
489
490 self.curr_byte_offset = i;
491
492 if self.config.retain_comments {
493 let content = &self.source[content_start..i];
494 let span = self.make_span(start);
495 self.pending_trivia.push(
496 GraphQLTriviaToken::Comment {
497 value: Cow::Borrowed(content),
498 span,
499 },
500 );
501 }
502 }
503
504 // =========================================================================
505 // Dot / Ellipsis lexing
506 // =========================================================================
507
    /// Lexes dots, producing either an Ellipsis token or an error.
    ///
    /// This implements a state machine for dot handling similar to
    /// `RustMacroGraphQLTokenSource`:
    /// - `...` (adjacent) → `Ellipsis`
    /// - `.` alone → Error (no hint - could be many things like `Foo.Bar`)
    /// - `..` (adjacent) → Error with help to add third dot
    /// - `. .` (spaced, same line) → Error with help about spacing
    /// - `.. .` (first two adjacent, third spaced) → Error with help about
    ///   spacing
    /// - `. ..` (first spaced, last two adjacent) → Error with help about
    ///   spacing
    /// - `. . .` (all spaced, same line) → Error with help about spacing
    /// - Dots on different lines → Separate errors
    ///
    /// Adjacency is detected by comparing byte offsets: since `.` is a
    /// single ASCII byte, two dots are adjacent exactly when the second
    /// starts one byte after the first.
    ///
    /// TODO: Look for patterns like `{Name}.{Name}` and give a useful error
    /// hint (e.g., user may have been trying to use enum syntax incorrectly).
    fn lex_dot_or_ellipsis(&mut self, start: u32) -> GraphQLToken<'src> {
        // Consume first dot
        self.consume();

        // Check for second dot (may be adjacent or spaced on the same line).
        // `skip_whitespace_same_line()` never crosses newlines, so if the
        // next char after skipping is not a dot, we fall through to the
        // single-dot error case.
        self.skip_whitespace_same_line();

        match self.peek_char() {
            Some('.') => {
                let second_dot_start = self.curr_offset();
                // Adjacent iff no whitespace was skipped between the dots.
                let first_two_adjacent = second_dot_start == start + 1;
                self.consume();

                // Check for third dot
                self.skip_whitespace_same_line();

                match self.peek_char() {
                    Some('.') => {
                        let third_dot_start = self.curr_offset();
                        self.consume();
                        let span = self.make_span(start);

                        // Check if all three dots were adjacent (no whitespace)
                        let second_third_adjacent =
                            third_dot_start == second_dot_start + 1;

                        if first_two_adjacent && second_third_adjacent {
                            // All adjacent - valid ellipsis
                            self.make_token(GraphQLTokenKind::Ellipsis, span)
                        } else if first_two_adjacent {
                            // `.. .` - first two adjacent, third spaced
                            let kind = GraphQLTokenKind::error(
                                "Unexpected `.. .`",
                                smallvec![GraphQLErrorNote::help(
                                    "This `.` may have been intended to complete a `...` \
                                     spread operator. Try removing the extra spacing \
                                     between the dots."
                                )],
                            );
                            self.make_token(kind, span)
                        } else if second_third_adjacent {
                            // `. ..` - first spaced, last two adjacent
                            let kind = GraphQLTokenKind::error(
                                "Unexpected `. ..`",
                                smallvec![GraphQLErrorNote::help(
                                    "These dots may have been intended to form a `...` \
                                     spread operator. Try removing the extra spacing \
                                     between the dots."
                                )],
                            );
                            self.make_token(kind, span)
                        } else {
                            // `. . .` - all spaced
                            let kind = GraphQLTokenKind::error(
                                "Unexpected `. . .`",
                                smallvec![GraphQLErrorNote::help(
                                    "These dots may have been intended to form a `...` \
                                     spread operator. Try removing the extra spacing \
                                     between the dots."
                                )],
                            );
                            self.make_token(kind, span)
                        }
                    }
                    _ => {
                        // Only two dots found on this line
                        let span = self.make_span(start);

                        if first_two_adjacent {
                            // Adjacent `..` - suggest adding third dot
                            let kind = GraphQLTokenKind::error(
                                "Unexpected `..` (use `...` for spread operator)",
                                smallvec![GraphQLErrorNote::help(
                                    "Add one more `.` to form the spread operator `...`"
                                )],
                            );
                            self.make_token(kind, span)
                        } else {
                            // Spaced `. .` - suggest removing spacing
                            let kind = GraphQLTokenKind::error(
                                "Unexpected `. .` (use `...` for spread operator)",
                                smallvec![GraphQLErrorNote::help(
                                    "These dots may have been intended to form a `...` \
                                     spread operator. Try removing the extra spacing \
                                     between the dots."
                                )],
                            );
                            self.make_token(kind, span)
                        }
                    }
                }
            }
            _ => {
                // Single dot (or dots on different lines)
                // Don't assume it was meant to be ellipsis - could be `Foo.Bar` style
                let span = self.make_span(start);
                let kind = GraphQLTokenKind::error("Unexpected `.`", smallvec![]);
                self.make_token(kind, span)
            }
        }
    }
629
630 /// Skips whitespace but only on the same line.
631 ///
632 /// Used for dot consolidation - we only merge dots that are on the same
633 /// line.
634 fn skip_whitespace_same_line(&mut self) {
635 while let Some(ch) = self.peek_char() {
636 match ch {
637 ' ' | '\t' | '\u{FEFF}' => {
638 self.consume();
639 }
640 _ => break,
641 }
642 }
643 }
644
645 // =========================================================================
646 // Name lexing
647 // =========================================================================
648
649 /// Lexes a name or keyword.
650 ///
651 /// Names match the pattern: `/[_A-Za-z][_0-9A-Za-z]*/`
652 ///
653 /// Keywords `true`, `false`, and `null` are emitted as distinct token
654 /// kinds.
655 ///
656 /// # Performance (B2 in benchmark-optimizations.md)
657 ///
658 /// Uses byte-scanning to find the end of the name in a tight
659 /// loop (one byte comparison per iteration), then updates
660 /// `curr_byte_offset` once for the entire name.
661 fn lex_name(&mut self, start: u32) -> GraphQLToken<'src> {
662 let name_start = self.curr_byte_offset;
663 let bytes = self.source.as_bytes();
664
665 // Byte-scan: skip first char (already validated as name
666 // start) and continue while bytes match [_0-9A-Za-z].
667 let mut i = name_start + 1;
668 while i < bytes.len() && is_name_continue_byte(bytes[i]) {
669 i += 1;
670 }
671
672 self.curr_byte_offset = i;
673
674 let name = &self.source[name_start..i];
675 let span = self.make_span(start);
676
677 // Check for keywords
678 let kind = match name {
679 "true" => GraphQLTokenKind::True,
680 "false" => GraphQLTokenKind::False,
681 "null" => GraphQLTokenKind::Null,
682 _ => GraphQLTokenKind::name_borrowed(name),
683 };
684
685 self.make_token(kind, span)
686 }
687
688 // =========================================================================
689 // Number lexing
690 // =========================================================================
691
692 /// Lexes an integer or float literal.
693 ///
694 /// Handles:
695 /// - Optional negative sign: `-`
696 /// - Integer part: `0` or `[1-9][0-9]*`
697 /// - Optional decimal part: `.[0-9]+`
698 /// - Optional exponent: `[eE][+-]?[0-9]+`
699 fn lex_number(&mut self, start: u32) -> GraphQLToken<'src> {
700 let num_start = self.curr_byte_offset;
701 let mut is_float = false;
702
703 // Optional negative sign
704 if self.peek_char() == Some('-') {
705 self.consume();
706 }
707
708 // Integer part
709 match self.peek_char() {
710 Some('0') => {
711 self.consume();
712 // Check for invalid leading zeros (e.g., 00, 01)
713 if let Some(ch) = self.peek_char()
714 && ch.is_ascii_digit() {
715 // Invalid: leading zeros
716 return self.lex_number_error(
717 start,
718 num_start,
719 "Invalid number: leading zeros are not allowed",
720 Some("https://spec.graphql.org/September2025/#sec-Int-Value"),
721 );
722 }
723 }
724 Some(ch) if ch.is_ascii_digit() => {
725 // Non-zero start
726 self.consume();
727 while let Some(ch) = self.peek_char() {
728 if ch.is_ascii_digit() {
729 self.consume();
730 } else {
731 break;
732 }
733 }
734 }
735 Some(_) | None => {
736 // Just a `-` with no digits
737 let span = self.make_span(start);
738 let kind = GraphQLTokenKind::error("Unexpected `-`", smallvec![]);
739 return self.make_token(kind, span);
740 }
741 }
742
743 // Optional decimal part
744 if self.peek_char() == Some('.') {
745 // Check that the next character is a digit (not another dot for `...`)
746 if let Some(ch) = self.peek_char_nth(1)
747 && ch.is_ascii_digit() {
748 is_float = true;
749 self.consume(); // consume the '.'
750
751 // Consume decimal digits
752 while let Some(ch) = self.peek_char() {
753 if ch.is_ascii_digit() {
754 self.consume();
755 } else {
756 break;
757 }
758 }
759 }
760 }
761
762 // Optional exponent part
763 if let Some(ch) = self.peek_char()
764 && (ch == 'e' || ch == 'E') {
765 is_float = true;
766 self.consume();
767
768 // Optional sign
769 if let Some(ch) = self.peek_char()
770 && (ch == '+' || ch == '-') {
771 self.consume();
772 }
773
774 // Exponent digits (required)
775 let has_exponent_digits = matches!(self.peek_char(), Some(ch) if ch.is_ascii_digit());
776 if !has_exponent_digits {
777 return self.lex_number_error(
778 start,
779 num_start,
780 "Invalid number: exponent must have at least one digit",
781 Some("https://spec.graphql.org/September2025/#sec-Float-Value"),
782 );
783 }
784
785 while let Some(ch) = self.peek_char() {
786 if ch.is_ascii_digit() {
787 self.consume();
788 } else {
789 break;
790 }
791 }
792 }
793
794 let num_end = self.curr_byte_offset;
795 let num_text = &self.source[num_start..num_end];
796 let span = self.make_span(start);
797
798 let kind = if is_float {
799 GraphQLTokenKind::float_value_borrowed(num_text)
800 } else {
801 GraphQLTokenKind::int_value_borrowed(num_text)
802 };
803
804 self.make_token(kind, span)
805 }
806
807 /// Creates an error token for an invalid number.
808 fn lex_number_error(
809 &mut self,
810 start: u32,
811 num_start: usize,
812 message: &str,
813 spec_url: Option<&str>,
814 ) -> GraphQLToken<'src> {
815 // Consume remaining number-like characters to provide better error recovery
816 while let Some(ch) = self.peek_char() {
817 if ch.is_ascii_digit() || ch == '.' || ch == 'e' || ch == 'E' || ch == '+' || ch == '-' {
818 self.consume();
819 } else {
820 break;
821 }
822 }
823
824 let num_end = self.curr_byte_offset;
825 let invalid_text = &self.source[num_start..num_end];
826 let span = self.make_span(start);
827
828 let mut error_notes = smallvec![];
829 if let Some(url) = spec_url {
830 error_notes.push(GraphQLErrorNote::spec(url));
831 }
832
833 let kind = GraphQLTokenKind::error(
834 format!("{message}: `{invalid_text}`"),
835 error_notes,
836 );
837
838 self.make_token(kind, span)
839 }
840
841 // =========================================================================
842 // String lexing
843 // =========================================================================
844
845 /// Creates an error token for an unescaped newline in a single-line
846 /// string. Shared by the \n and \r error paths in `lex_string()`.
847 fn lex_string_newline_error(&mut self, start: u32) -> GraphQLToken<'src> {
848 let span = self.make_span(start);
849 let kind = GraphQLTokenKind::error(
850 "Unterminated string literal",
851 smallvec![
852 GraphQLErrorNote::general(
853 "Single-line strings cannot contain unescaped newlines"
854 ),
855 GraphQLErrorNote::help(
856 "Use a block string (triple quotes) for multi-line \
857 strings, or escape the newline with `\\n`"
858 ),
859 ],
860 );
861 self.make_token(kind, span)
862 }
863
    /// Lexes a string literal (single-line or block string).
    ///
    /// `start` is the token's starting offset at the opening `"`. When the
    /// input begins with `"""`, delegates to `lex_block_string()`. On
    /// success the token's value is the raw source slice *including* both
    /// quote delimiters; escape sequences are not processed here.
    ///
    /// Error cases: an unescaped line terminator produces an error via
    /// `lex_string_newline_error()` (with the offset advanced past the
    /// terminator), and end-of-input before the closing `"` produces an
    /// "Unterminated string literal" error.
    fn lex_string(&mut self, start: u32) -> GraphQLToken<'src> {
        let str_start = self.curr_byte_offset;

        // Check for block string (""")
        if self.remaining().starts_with("\"\"\"") {
            return self.lex_block_string(start, str_start);
        }

        // Single-line string — byte-scan with SIMD-accelerated
        // sentinel search via memchr3. The three sentinel bytes:
        //   b'"'  — end of string
        //   b'\\' — escape sequence
        //   b'\n' — error (unescaped newline)
        //
        // For \r we check the byte immediately before each \n
        // match (to handle \r\n), and we also check the gap
        // between the current position and the match for any
        // bare \r. Bare \r is extremely rare in practice so
        // the memchr call in the gap almost never fires.
        //
        // This is safe for multi-byte UTF-8 because all
        // sentinels are ASCII (<0x80) and can never appear as
        // continuation bytes in multi-byte sequences (>=0x80).
        let bytes = self.source.as_bytes();
        let mut i = self.curr_byte_offset + 1; // skip opening "

        loop {
            match memchr::memchr3(b'"', b'\\', b'\n', &bytes[i..]) {
                None => {
                    // Before declaring EOF, check if there's a
                    // \r in the remaining bytes.
                    if let Some(cr_off) =
                        memchr::memchr(b'\r', &bytes[i..])
                    {
                        // Advance past the \r (and a following \n if
                        // this is a \r\n pair) before erroring.
                        i += cr_off + 1;
                        if i < bytes.len() && bytes[i] == b'\n' {
                            i += 1;
                        }
                        self.curr_byte_offset = i;
                        return self.lex_string_newline_error(start);
                    }
                    // Hit EOF without closing quote
                    self.curr_byte_offset = bytes.len();
                    let span = self.make_span(start);
                    let kind = GraphQLTokenKind::error(
                        "Unterminated string literal",
                        smallvec![
                            GraphQLErrorNote::general_with_span(
                                "String started here",
                                self.resolve_span(span),
                            ),
                            GraphQLErrorNote::help("Add closing `\"`"),
                        ],
                    );
                    return self.make_token(kind, span);
                },
                Some(offset) => {
                    // Check for bare \r in the gap [i..i+offset)
                    if let Some(cr_off) =
                        memchr::memchr(b'\r', &bytes[i..i + offset])
                    {
                        i += cr_off + 1;
                        if i < bytes.len() && bytes[i] == b'\n' {
                            i += 1;
                        }
                        self.curr_byte_offset = i;
                        return self.lex_string_newline_error(start);
                    }

                    i += offset;
                    match bytes[i] {
                        b'"' => {
                            // End of string
                            i += 1;
                            break;
                        },
                        b'\\' => {
                            // Escape sequence — skip backslash +
                            // next byte (which could be `"` or `\`)
                            i += 1;
                            if i < bytes.len() {
                                i += 1;
                            }
                        },
                        b'\n' => {
                            // Bare \n — any preceding \r would have
                            // been caught by the gap-check above
                            i += 1;
                            self.curr_byte_offset = i;
                            return self.lex_string_newline_error(
                                start,
                            );
                        },
                        // memchr3 only ever stops on the three
                        // sentinel bytes matched above.
                        _ => unreachable!(),
                    }
                },
            }
        }

        self.curr_byte_offset = i;
        let str_end = self.curr_byte_offset;
        let string_text = &self.source[str_start..str_end];
        let span = self.make_span(start);

        self.make_token(GraphQLTokenKind::string_value_borrowed(string_text), span)
    }
971
    /// Lexes a block string literal (`"""..."""`).
    ///
    /// `start` is the token's starting offset and `str_start` the byte
    /// offset of the first `"` — passed through from `lex_string()`, which
    /// has already verified the opening `"""`. On success the token's
    /// value is the raw source slice *including* both `"""` delimiters;
    /// dedenting and escape processing are not done here.
    ///
    /// # Performance (B2 in benchmark-optimizations.md)
    ///
    /// Uses a SIMD-accelerated `memchr2` scan that jumps straight to the
    /// next `"` or `\` sentinel instead of advancing byte-by-byte — block
    /// string bodies are typically long runs of documentation text where
    /// neither sentinel appears.
    ///
    /// This is safe for multi-byte UTF-8 content because the
    /// sentinel bytes (`"` = 0x22, `\` = 0x5C) are ASCII (<0x80)
    /// and can never appear as continuation bytes in multi-byte
    /// UTF-8 sequences (which are always >=0x80).
    fn lex_block_string(
        &mut self,
        start: u32,
        str_start: usize,
    ) -> GraphQLToken<'src> {
        let bytes = self.source.as_bytes();

        // Skip opening """ (3 ASCII bytes, caller verified).
        let mut i = self.curr_byte_offset + 3;

        // SIMD-accelerated scan: jump to the next `"` or `\`
        // instead of advancing byte-by-byte through
        // documentation text. Block string bodies are typically
        // long runs of text where neither sentinel appears.
        let found_close = loop {
            match memchr::memchr2(b'"', b'\\', &bytes[i..]) {
                None => {
                    i = bytes.len();
                    break false;
                },
                Some(offset) => {
                    i += offset;
                    match bytes[i] {
                        b'"' if i + 2 < bytes.len()
                            && bytes[i + 1] == b'"'
                            && bytes[i + 2] == b'"' =>
                        {
                            // Closing """.
                            i += 3;
                            break true;
                        },
                        b'\\' if i + 3 < bytes.len()
                            && bytes[i + 1] == b'"'
                            && bytes[i + 2] == b'"'
                            && bytes[i + 3] == b'"' =>
                        {
                            // Escaped triple quote \""" — the only
                            // escape recognized inside block strings.
                            i += 4;
                        },
                        _ => {
                            // Lone `"` or `\` — not a
                            // terminator, skip past it.
                            i += 1;
                        },
                    }
                },
            }
        };

        self.curr_byte_offset = i;

        if !found_close {
            // Unterminated block string.
            let span = self.make_span(start);
            let kind = GraphQLTokenKind::error(
                "Unterminated block string",
                smallvec![
                    GraphQLErrorNote::general_with_span(
                        "Block string started here",
                        self.resolve_span(span),
                    ),
                    GraphQLErrorNote::help("Add closing `\"\"\"`"),
                ],
            );
            return self.make_token(kind, span);
        }

        let str_end = self.curr_byte_offset;
        let string_text = &self.source[str_start..str_end];
        let span = self.make_span(start);

        self.make_token(
            GraphQLTokenKind::string_value_borrowed(string_text),
            span,
        )
    }
1062
1063 // =========================================================================
1064 // Invalid character handling
1065 // =========================================================================
1066
1067 /// Lexes an invalid character, producing an error token.
1068 fn lex_invalid_character(&mut self, start: u32) -> GraphQLToken<'src> {
1069 let ch = self.consume().unwrap();
1070 let span = self.make_span(start);
1071
1072 let kind = GraphQLTokenKind::error(
1073 format!("Unexpected character {}", describe_char(ch)),
1074 smallvec![],
1075 );
1076
1077 self.make_token(kind, span)
1078 }
1079}
1080
1081// =============================================================================
1082// Iterator implementation
1083// =============================================================================
1084
1085impl<'src> Iterator for StrGraphQLTokenSource<'src> {
1086 type Item = GraphQLToken<'src>;
1087
1088 fn next(&mut self) -> Option<Self::Item> {
1089 if self.finished {
1090 return None;
1091 }
1092
1093 let token = self.next_token();
1094
1095 if matches!(token.kind, GraphQLTokenKind::Eof) {
1096 self.finished = true;
1097 }
1098
1099 Some(token)
1100 }
1101}
1102
impl<'src> GraphQLTokenSource<'src> for StrGraphQLTokenSource<'src> {
    /// Returns a shared reference to this lexer's source map.
    fn source_map(&self) -> &SourceMap<'src> {
        &self.source_map
    }

    /// Consumes the lexer, returning ownership of its source map.
    fn into_source_map(self) -> SourceMap<'src> {
        self.source_map
    }
}
1112
1113// =============================================================================
1114// Helper functions
1115// =============================================================================
1116
/// Returns `true` if `ch` can start a GraphQL name.
///
/// Per the GraphQL spec, a name begins with `NameStart` — an
/// underscore or an ASCII letter:
/// <https://spec.graphql.org/September2025/#NameStart>
fn is_name_start(ch: char) -> bool {
    matches!(ch, '_' | 'a'..='z' | 'A'..='Z')
}
1124
/// 256-entry lookup table for GraphQL NameContinue classification.
///
/// Indexed by byte value: `true` for `_` (0x5F), digits `0`–`9`
/// (0x30–0x39), and ASCII letters `A`–`Z` (0x41–0x5A) / `a`–`z`
/// (0x61–0x7A). Every other byte — including non-ASCII (>=0x80) —
/// maps to `false`, which is correct because GraphQL names are
/// ASCII-only per spec.
const NAME_CONTINUE_TABLE: [bool; 256] = {
    let mut table = [false; 256];
    let mut idx = 0usize;
    while idx < 256 {
        table[idx] = matches!(
            idx as u8,
            b'_' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z'
        );
        idx += 1;
    }
    table
};
1143
1144/// Returns `true` if `b` can continue a GraphQL name.
1145///
1146/// Per the GraphQL spec, names continue with `NameContinue`:
1147/// <https://spec.graphql.org/September2025/#NameContinue>
1148///
1149/// Uses a lookup table for branchless O(1) classification in the
1150/// tight `lex_name()` scanning loop (see B21 in
1151/// benchmark-optimizations.md). Non-ASCII bytes (>=0x80) always
1152/// return false, which is correct since GraphQL names are
1153/// ASCII-only by spec.
1154#[inline]
1155fn is_name_continue_byte(b: u8) -> bool {
1156 NAME_CONTINUE_TABLE[b as usize]
1157}
1158
1159/// Returns a human-readable description of a character for error messages.
1160///
1161/// For printable characters, returns the character in backticks.
1162/// For invisible/control characters, includes Unicode code point description.
1163fn describe_char(ch: char) -> String {
1164 if ch.is_control() || (ch.is_whitespace() && ch != ' ') {
1165 // Invisible characters get detailed description
1166 let name = unicode_char_name(ch);
1167 if let Some(name) = name {
1168 format!("`{}` (U+{:04X}: {})", ch, ch as u32, name)
1169 } else {
1170 format!("`{}` (U+{:04X})", ch, ch as u32)
1171 }
1172 } else {
1173 format!("`{ch}`")
1174 }
1175}
1176
/// Returns the Unicode name for well-known invisible/control characters.
///
/// This provides meaningful names for commonly encountered invisible
/// characters (C0/C1 controls, zero-width and bidirectional formatting
/// marks, exotic spaces, the BOM, etc.) so that `describe_char` can
/// produce actionable "unexpected character" messages. Returns `None`
/// for characters outside this curated list — it is intentionally not
/// a full Unicode name database.
fn unicode_char_name(ch: char) -> Option<&'static str> {
    match ch {
        // C0 control characters (U+0000 - U+001F)
        '\u{0000}' => Some("NULL"),
        '\u{0001}' => Some("START OF HEADING"),
        '\u{0002}' => Some("START OF TEXT"),
        '\u{0003}' => Some("END OF TEXT"),
        '\u{0004}' => Some("END OF TRANSMISSION"),
        '\u{0005}' => Some("ENQUIRY"),
        '\u{0006}' => Some("ACKNOWLEDGE"),
        '\u{0007}' => Some("BELL"),
        '\u{0008}' => Some("BACKSPACE"),
        '\u{0009}' => Some("HORIZONTAL TAB"),
        '\u{000A}' => Some("LINE FEED"),
        '\u{000B}' => Some("VERTICAL TAB"),
        '\u{000C}' => Some("FORM FEED"),
        '\u{000D}' => Some("CARRIAGE RETURN"),
        '\u{000E}' => Some("SHIFT OUT"),
        '\u{000F}' => Some("SHIFT IN"),
        '\u{0010}' => Some("DATA LINK ESCAPE"),
        '\u{0011}' => Some("DEVICE CONTROL ONE"),
        '\u{0012}' => Some("DEVICE CONTROL TWO"),
        '\u{0013}' => Some("DEVICE CONTROL THREE"),
        '\u{0014}' => Some("DEVICE CONTROL FOUR"),
        '\u{0015}' => Some("NEGATIVE ACKNOWLEDGE"),
        '\u{0016}' => Some("SYNCHRONOUS IDLE"),
        '\u{0017}' => Some("END OF TRANSMISSION BLOCK"),
        '\u{0018}' => Some("CANCEL"),
        '\u{0019}' => Some("END OF MEDIUM"),
        '\u{001A}' => Some("SUBSTITUTE"),
        '\u{001B}' => Some("ESCAPE"),
        '\u{001C}' => Some("FILE SEPARATOR"),
        '\u{001D}' => Some("GROUP SEPARATOR"),
        '\u{001E}' => Some("RECORD SEPARATOR"),
        '\u{001F}' => Some("UNIT SEPARATOR"),

        // C1 control characters and special (U+007F - U+00A0)
        '\u{007F}' => Some("DELETE"),
        '\u{0080}' => Some("PADDING CHARACTER"),
        '\u{0081}' => Some("HIGH OCTET PRESET"),
        '\u{0082}' => Some("BREAK PERMITTED HERE"),
        '\u{0083}' => Some("NO BREAK HERE"),
        '\u{0084}' => Some("INDEX"),
        '\u{0085}' => Some("NEXT LINE"),
        '\u{0086}' => Some("START OF SELECTED AREA"),
        '\u{0087}' => Some("END OF SELECTED AREA"),
        '\u{0088}' => Some("CHARACTER TABULATION SET"),
        '\u{0089}' => Some("CHARACTER TABULATION WITH JUSTIFICATION"),
        '\u{008A}' => Some("LINE TABULATION SET"),
        '\u{008B}' => Some("PARTIAL LINE FORWARD"),
        '\u{008C}' => Some("PARTIAL LINE BACKWARD"),
        '\u{008D}' => Some("REVERSE LINE FEED"),
        '\u{008E}' => Some("SINGLE SHIFT TWO"),
        '\u{008F}' => Some("SINGLE SHIFT THREE"),
        '\u{0090}' => Some("DEVICE CONTROL STRING"),
        '\u{0091}' => Some("PRIVATE USE ONE"),
        '\u{0092}' => Some("PRIVATE USE TWO"),
        '\u{0093}' => Some("SET TRANSMIT STATE"),
        '\u{0094}' => Some("CANCEL CHARACTER"),
        '\u{0095}' => Some("MESSAGE WAITING"),
        '\u{0096}' => Some("START OF GUARDED AREA"),
        '\u{0097}' => Some("END OF GUARDED AREA"),
        '\u{0098}' => Some("START OF STRING"),
        '\u{0099}' => Some("SINGLE GRAPHIC CHARACTER INTRODUCER"),
        '\u{009A}' => Some("SINGLE CHARACTER INTRODUCER"),
        '\u{009B}' => Some("CONTROL SEQUENCE INTRODUCER"),
        '\u{009C}' => Some("STRING TERMINATOR"),
        '\u{009D}' => Some("OPERATING SYSTEM COMMAND"),
        '\u{009E}' => Some("PRIVACY MESSAGE"),
        '\u{009F}' => Some("APPLICATION PROGRAM COMMAND"),
        '\u{00A0}' => Some("NO-BREAK SPACE"),
        '\u{00AD}' => Some("SOFT HYPHEN"),

        // General punctuation - spaces (U+2000 - U+200A)
        '\u{2000}' => Some("EN QUAD"),
        '\u{2001}' => Some("EM QUAD"),
        '\u{2002}' => Some("EN SPACE"),
        '\u{2003}' => Some("EM SPACE"),
        '\u{2004}' => Some("THREE-PER-EM SPACE"),
        '\u{2005}' => Some("FOUR-PER-EM SPACE"),
        '\u{2006}' => Some("SIX-PER-EM SPACE"),
        '\u{2007}' => Some("FIGURE SPACE"),
        '\u{2008}' => Some("PUNCTUATION SPACE"),
        '\u{2009}' => Some("THIN SPACE"),
        '\u{200A}' => Some("HAIR SPACE"),

        // Zero-width and formatting characters (U+200B - U+200F)
        '\u{200B}' => Some("ZERO WIDTH SPACE"),
        '\u{200C}' => Some("ZERO WIDTH NON-JOINER"),
        '\u{200D}' => Some("ZERO WIDTH JOINER"),
        '\u{200E}' => Some("LEFT-TO-RIGHT MARK"),
        '\u{200F}' => Some("RIGHT-TO-LEFT MARK"),

        // Bidirectional text formatting (U+202A - U+202F)
        '\u{202A}' => Some("LEFT-TO-RIGHT EMBEDDING"),
        '\u{202B}' => Some("RIGHT-TO-LEFT EMBEDDING"),
        '\u{202C}' => Some("POP DIRECTIONAL FORMATTING"),
        '\u{202D}' => Some("LEFT-TO-RIGHT OVERRIDE"),
        '\u{202E}' => Some("RIGHT-TO-LEFT OVERRIDE"),
        '\u{202F}' => Some("NARROW NO-BREAK SPACE"),

        // More formatting (U+2060 - U+206F)
        '\u{2060}' => Some("WORD JOINER"),
        '\u{2061}' => Some("FUNCTION APPLICATION"),
        '\u{2062}' => Some("INVISIBLE TIMES"),
        '\u{2063}' => Some("INVISIBLE SEPARATOR"),
        '\u{2064}' => Some("INVISIBLE PLUS"),
        '\u{2066}' => Some("LEFT-TO-RIGHT ISOLATE"),
        '\u{2067}' => Some("RIGHT-TO-LEFT ISOLATE"),
        '\u{2068}' => Some("FIRST STRONG ISOLATE"),
        '\u{2069}' => Some("POP DIRECTIONAL ISOLATE"),
        '\u{206A}' => Some("INHIBIT SYMMETRIC SWAPPING"),
        '\u{206B}' => Some("ACTIVATE SYMMETRIC SWAPPING"),
        '\u{206C}' => Some("INHIBIT ARABIC FORM SHAPING"),
        '\u{206D}' => Some("ACTIVATE ARABIC FORM SHAPING"),
        '\u{206E}' => Some("NATIONAL DIGIT SHAPES"),
        '\u{206F}' => Some("NOMINAL DIGIT SHAPES"),

        // Other special spaces
        '\u{2028}' => Some("LINE SEPARATOR"),
        '\u{2029}' => Some("PARAGRAPH SEPARATOR"),
        '\u{205F}' => Some("MEDIUM MATHEMATICAL SPACE"),
        '\u{3000}' => Some("IDEOGRAPHIC SPACE"),

        // Special characters
        '\u{034F}' => Some("COMBINING GRAPHEME JOINER"),
        '\u{061C}' => Some("ARABIC LETTER MARK"),
        '\u{115F}' => Some("HANGUL CHOSEONG FILLER"),
        '\u{1160}' => Some("HANGUL JUNGSEONG FILLER"),
        '\u{17B4}' => Some("KHMER VOWEL INHERENT AQ"),
        '\u{17B5}' => Some("KHMER VOWEL INHERENT AA"),
        '\u{180E}' => Some("MONGOLIAN VOWEL SEPARATOR"),

        // BOM and special
        '\u{FEFF}' => Some("BYTE ORDER MARK"),
        '\u{FFFE}' => Some("NONCHARACTER"),
        '\u{FFFF}' => Some("NONCHARACTER"),

        // Interlinear annotation
        '\u{FFF9}' => Some("INTERLINEAR ANNOTATION ANCHOR"),
        '\u{FFFA}' => Some("INTERLINEAR ANNOTATION SEPARATOR"),
        '\u{FFFB}' => Some("INTERLINEAR ANNOTATION TERMINATOR"),

        // Tag characters (U+E0000 - U+E007F)
        '\u{E0001}' => Some("LANGUAGE TAG"),
        '\u{E0020}' => Some("TAG SPACE"),

        _ => None,
    }
}
1331
#[cfg(test)]
mod name_continue_table_tests {
    use super::is_name_continue_byte;

    /// Exhaustively checks that the table-backed
    /// `is_name_continue_byte` agrees with the straightforward
    /// predicate `b == b'_' || b.is_ascii_alphanumeric()` for
    /// every possible byte value.
    ///
    /// This proves the lookup table is a faithful replacement
    /// for the original branching logic.
    ///
    /// Written by Claude Code, reviewed by a human.
    #[test]
    fn name_continue_table_matches_spec() {
        for i in u8::MIN..=u8::MAX {
            let expected = i == b'_' || i.is_ascii_alphanumeric();
            let actual = is_name_continue_byte(i);
            assert_eq!(
                actual,
                expected,
                "Mismatch at byte {i} (0x{i:02X}): table says {actual}, \
                 original logic says {expected}",
            );
        }
    }
}