libgraphql_parser/token_source/str_to_graphql_token_source.rs
1//! A [`GraphQLTokenSource`] that lexes from a `&str` input.
2//!
3//! This lexer implements zero-copy lexing: token values borrow directly from
4//! the source string using `Cow::Borrowed`, avoiding allocations for names,
5//! numbers, and strings.
6//!
7//! # Features
8//!
9//! - **Zero-copy lexing**: Token values borrow from source text when possible
10//! - **Dual column tracking**: Reports both UTF-8 character positions (for
11//! display) and UTF-16 code unit positions (for LSP compatibility)
12//! - **Comment preservation**: GraphQL `#` comments are captured as trivia
13//! - **Error recovery**: Invalid characters emit `Error` tokens, allowing the
14//! lexer to continue and report multiple errors
15//!
16//! # Usage
17//!
18//! ```rust
19//! use libgraphql_parser::token_source::StrGraphQLTokenSource;
20//!
21//! let source = "{ name }";
22//! let lexer = StrGraphQLTokenSource::new(source);
23//! for token in lexer {
24//! println!("{:?}", token.kind);
25//! }
26//! // Output:
27//! // CurlyBraceOpen
28//! // Name(Borrowed("name"))
29//! // CurlyBraceClose
30//! // Eof
31//! ```
32
33use crate::smallvec;
34use crate::token::GraphQLToken;
35use crate::token::GraphQLTokenKind;
36use crate::token::GraphQLTriviaToken;
37use crate::token::GraphQLTriviaTokenVec;
38use crate::GraphQLErrorNote;
39use crate::GraphQLSourceSpan;
40use crate::SourcePosition;
41use std::borrow::Cow;
42use std::path::Path;
43
/// A [`GraphQLTokenSource`] that lexes from a `&str` input.
///
/// This lexer produces [`GraphQLToken`]s with zero-copy string values where
/// possible. The `'src` lifetime ties token values to the source string.
///
/// See module documentation for details.
pub struct StrGraphQLTokenSource<'src> {
    /// The full source text being lexed.
    source: &'src str,

    /// Current byte offset from the start of `source`.
    ///
    /// The remaining text to lex is `&source[curr_byte_offset..]`.
    /// Invariant: always lies on a UTF-8 character boundary (it is only ever
    /// advanced by whole-character byte lengths).
    curr_byte_offset: usize,

    /// Current 0-based line number.
    ///
    /// Incremented on `\n` and `\r`; a `\r\n` pair counts as one newline
    /// (see `last_char_was_cr`).
    curr_line: usize,

    /// Current UTF-8 character column (0-based).
    ///
    /// This counts characters, not bytes. For example, "🎉" (4 bytes) advances
    /// this by 1.
    curr_col_utf8: usize,

    /// Current UTF-16 code unit column (0-based).
    ///
    /// Characters outside the Basic Multilingual Plane (U+10000 and above)
    /// advance this by 2 (surrogate pair). For example, "🎉" (U+1F389) advances
    /// this by 2.
    curr_col_utf16: usize,

    /// Whether the previous character was `\r`.
    ///
    /// Used to handle `\r\n` as a single newline: when we see `\r`, we set
    /// this flag; if the next character is `\n`, we skip it without
    /// incrementing the line number again.
    last_char_was_cr: bool,

    /// Trivia (comments, commas) accumulated before the next token.
    ///
    /// Drained (via `std::mem::take`) into each emitted token's
    /// `preceding_trivia`.
    pending_trivia: GraphQLTriviaTokenVec<'src>,

    /// Whether the EOF token has been emitted.
    finished: bool,

    /// Optional file path for error messages and spans.
    ///
    /// When present, this is included in `GraphQLSourceSpan::file_path`.
    /// Borrowed from the caller to avoid allocation.
    file_path: Option<&'src Path>,
}
94
95impl<'src> StrGraphQLTokenSource<'src> {
96 /// Creates a new token source from a string slice.
97 ///
98 /// # Example
99 ///
100 /// ```rust
101 /// # use libgraphql_parser::token_source::StrGraphQLTokenSource;
102 /// let lexer = StrGraphQLTokenSource::new("{ name }");
103 /// ```
104 pub fn new(source: &'src str) -> Self {
105 Self {
106 source,
107 curr_byte_offset: 0,
108 curr_line: 0,
109 curr_col_utf8: 0,
110 curr_col_utf16: 0,
111 last_char_was_cr: false,
112 pending_trivia: smallvec![],
113 finished: false,
114 file_path: None,
115 }
116 }
117
118 /// Creates a new token source with an associated file path.
119 ///
120 /// The file path is included in token spans for error reporting.
121 pub fn with_file_path(source: &'src str, path: &'src Path) -> Self {
122 Self {
123 source,
124 curr_byte_offset: 0,
125 curr_line: 0,
126 curr_col_utf8: 0,
127 curr_col_utf16: 0,
128 last_char_was_cr: false,
129 pending_trivia: smallvec![],
130 finished: false,
131 file_path: Some(path),
132 }
133 }
134
135 // =========================================================================
136 // Position and scanning helpers
137 // =========================================================================
138
139 /// Returns the remaining source text to be lexed.
140 fn remaining(&self) -> &'src str {
141 &self.source[self.curr_byte_offset..]
142 }
143
144 /// Returns the current source position.
145 fn curr_position(&self) -> SourcePosition {
146 SourcePosition::new(
147 self.curr_line,
148 self.curr_col_utf8,
149 Some(self.curr_col_utf16),
150 self.curr_byte_offset,
151 )
152 }
153
154 /// Peeks at the next character without consuming it.
155 ///
156 /// Returns `None` if at end of input.
157 ///
158 /// # Performance (B1 in benchmark-optimizations.md)
159 ///
160 /// This uses direct byte access with an ASCII fast path instead
161 /// of the naive `remaining().chars().next()`. GraphQL source text
162 /// is overwhelmingly ASCII (names, keywords, punctuators,
163 /// whitespace), so the fast path covers >99% of calls. The
164 /// non-ASCII fallback (Unicode in string literals/comments) is
165 /// rare and can remain slow.
166 ///
167 /// Without this optimization, every peek would construct a
168 /// `Chars` iterator and decode the first UTF-8 sequence — a
169 /// measurable cost given that peek is called millions of times
170 /// for large inputs.
171 #[inline]
172 fn peek_char(&self) -> Option<char> {
173 let bytes = self.source.as_bytes();
174 if self.curr_byte_offset >= bytes.len() {
175 return None;
176 }
177 let b = bytes[self.curr_byte_offset];
178 if b.is_ascii() {
179 // Fast path: single-byte ASCII character (covers >99%
180 // of GraphQL source text).
181 Some(b as char)
182 } else {
183 // Slow path: multi-byte UTF-8 character. Fall back to
184 // full UTF-8 decoding. This only triggers inside
185 // string literals or comments containing non-ASCII
186 // characters.
187 self.source[self.curr_byte_offset..].chars().next()
188 }
189 }
190
191 /// Peeks at the nth character ahead without consuming.
192 ///
193 /// `peek_char_nth(0)` is equivalent to `peek_char()`.
194 /// Returns `None` if there aren't enough characters remaining.
195 ///
196 /// Note: Unlike `peek_char()`, this still uses the iterator
197 /// approach since it needs to skip over variable-width UTF-8
198 /// characters to reach position n. This method is only called
199 /// in a few places for multi-character lookahead (e.g., number
200 /// parsing to check digit after `.`), so it is not a hot path.
201 fn peek_char_nth(&self, n: usize) -> Option<char> {
202 self.remaining().chars().nth(n)
203 }
204
205 /// Consumes the next character and updates position tracking.
206 ///
207 /// Returns `None` if at end of input.
208 ///
209 /// This method handles:
210 /// - Advancing byte offset by the character's UTF-8 length
211 /// - Incrementing line number on newlines (`\n`, `\r`, `\r\n`)
212 /// - Tracking UTF-8 character column and UTF-16 code unit column
213 ///
214 /// # Performance (B1 in benchmark-optimizations.md)
215 ///
216 /// Uses an ASCII fast path: if the current byte is <0x80, we
217 /// know it is exactly 1 byte, 1 UTF-8 column, and 1 UTF-16
218 /// code unit, so we avoid calling `ch.len_utf8()` and
219 /// `ch.len_utf16()`. The slow path handles multi-byte UTF-8
220 /// sequences.
221 fn consume(&mut self) -> Option<char> {
222 let bytes = self.source.as_bytes();
223 if self.curr_byte_offset >= bytes.len() {
224 return None;
225 }
226
227 let b = bytes[self.curr_byte_offset];
228
229 if b.is_ascii() {
230 // ASCII fast path: 1 byte, 1 UTF-8 col, 1 UTF-16 unit
231 let ch = b as char;
232
233 if ch == '\n' {
234 if self.last_char_was_cr {
235 self.last_char_was_cr = false;
236 } else {
237 self.curr_line += 1;
238 self.curr_col_utf8 = 0;
239 self.curr_col_utf16 = 0;
240 }
241 } else if ch == '\r' {
242 self.curr_line += 1;
243 self.curr_col_utf8 = 0;
244 self.curr_col_utf16 = 0;
245 self.last_char_was_cr = true;
246 } else {
247 self.curr_col_utf8 += 1;
248 self.curr_col_utf16 += 1;
249 self.last_char_was_cr = false;
250 }
251
252 self.curr_byte_offset += 1;
253 Some(ch)
254 } else {
255 // Multi-byte UTF-8 character (non-ASCII). This only
256 // occurs inside string literals or comments containing
257 // Unicode characters. We fall back to full char
258 // decoding to get the correct byte length and UTF-16
259 // length.
260 let ch = self.source[self.curr_byte_offset..]
261 .chars()
262 .next()
263 .unwrap();
264 let byte_len = ch.len_utf8();
265
266 // Non-ASCII characters are never newlines, so always
267 // advance columns.
268 self.curr_col_utf8 += 1;
269 self.curr_col_utf16 += ch.len_utf16();
270 self.last_char_was_cr = false;
271
272 self.curr_byte_offset += byte_len;
273 Some(ch)
274 }
275 }
276
277 /// Creates a `GraphQLSourceSpan` from a start position to the current
278 /// position.
279 fn make_span(&self, start: SourcePosition) -> GraphQLSourceSpan {
280 let end = self.curr_position();
281 if let Some(path) = self.file_path {
282 GraphQLSourceSpan::with_file(start, end, path.to_path_buf())
283 } else {
284 GraphQLSourceSpan::new(start, end)
285 }
286 }
287
288 // =========================================================================
289 // Token creation helpers
290 // =========================================================================
291
292 /// Creates a token with the accumulated trivia.
293 fn make_token(
294 &mut self,
295 kind: GraphQLTokenKind<'src>,
296 span: GraphQLSourceSpan,
297 ) -> GraphQLToken<'src> {
298 GraphQLToken {
299 kind,
300 preceding_trivia: std::mem::take(&mut self.pending_trivia),
301 span,
302 }
303 }
304
305 // =========================================================================
306 // Lexer main loop
307 // =========================================================================
308
309 /// Advances to the next token, skipping whitespace and collecting trivia.
310 fn next_token(&mut self) -> GraphQLToken<'src> {
311 loop {
312 // Skip whitespace
313 self.skip_whitespace();
314
315 let start = self.curr_position();
316
317 match self.peek_char() {
318 None => {
319 // End of input
320 let span = self.make_span(start);
321 return self.make_token(GraphQLTokenKind::Eof, span);
322 }
323
324 Some('#') => {
325 // Comment - collect as trivia and continue
326 self.lex_comment(start);
327 continue;
328 }
329
330 Some(',') => {
331 // Comma - collect as trivia and continue
332 self.consume();
333 let span = self.make_span(start);
334 self.pending_trivia
335 .push(GraphQLTriviaToken::Comma { span });
336 continue;
337 }
338
339 // Single-character punctuators
340 Some('!') => {
341 self.consume();
342 let span = self.make_span(start);
343 return self.make_token(GraphQLTokenKind::Bang, span);
344 }
345 Some('$') => {
346 self.consume();
347 let span = self.make_span(start);
348 return self.make_token(GraphQLTokenKind::Dollar, span);
349 }
350 Some('&') => {
351 self.consume();
352 let span = self.make_span(start);
353 return self.make_token(GraphQLTokenKind::Ampersand, span);
354 }
355 Some('(') => {
356 self.consume();
357 let span = self.make_span(start);
358 return self.make_token(GraphQLTokenKind::ParenOpen, span);
359 }
360 Some(')') => {
361 self.consume();
362 let span = self.make_span(start);
363 return self.make_token(GraphQLTokenKind::ParenClose, span);
364 }
365 Some(':') => {
366 self.consume();
367 let span = self.make_span(start);
368 return self.make_token(GraphQLTokenKind::Colon, span);
369 }
370 Some('=') => {
371 self.consume();
372 let span = self.make_span(start);
373 return self.make_token(GraphQLTokenKind::Equals, span);
374 }
375 Some('@') => {
376 self.consume();
377 let span = self.make_span(start);
378 return self.make_token(GraphQLTokenKind::At, span);
379 }
380 Some('[') => {
381 self.consume();
382 let span = self.make_span(start);
383 return self.make_token(GraphQLTokenKind::SquareBracketOpen, span);
384 }
385 Some(']') => {
386 self.consume();
387 let span = self.make_span(start);
388 return self.make_token(GraphQLTokenKind::SquareBracketClose, span);
389 }
390 Some('{') => {
391 self.consume();
392 let span = self.make_span(start);
393 return self.make_token(GraphQLTokenKind::CurlyBraceOpen, span);
394 }
395 Some('}') => {
396 self.consume();
397 let span = self.make_span(start);
398 return self.make_token(GraphQLTokenKind::CurlyBraceClose, span);
399 }
400 Some('|') => {
401 self.consume();
402 let span = self.make_span(start);
403 return self.make_token(GraphQLTokenKind::Pipe, span);
404 }
405
406 // Ellipsis or dot error
407 Some('.') => {
408 return self.lex_dot_or_ellipsis(start);
409 }
410
411 // String literals
412 Some('"') => {
413 return self.lex_string(start);
414 }
415
416 // Names and keywords
417 Some(c) if is_name_start(c) => {
418 return self.lex_name(start);
419 }
420
421 // Numbers (including negative)
422 Some(c) if c == '-' || c.is_ascii_digit() => {
423 return self.lex_number(start);
424 }
425
426 // Invalid character
427 Some(_) => {
428 return self.lex_invalid_character(start);
429 }
430 }
431 }
432 }
433
434 // =========================================================================
435 // Whitespace handling
436 // =========================================================================
437
    /// Skips whitespace characters.
    ///
    /// Per the GraphQL spec, these are "ignored tokens":
    /// - Space (U+0020)
    /// - Tab (U+0009)
    /// - Line terminators: LF (U+000A), CR (U+000D), CRLF
    /// - BOM (U+FEFF) - Unicode BOM is ignored anywhere in the document
    ///
    /// See: <https://spec.graphql.org/September2025/#sec-Language.Source-Text.Unicode>
    ///
    /// Note: Comma is also whitespace in GraphQL but we handle it separately
    /// to preserve it as trivia.
    ///
    /// # Performance (B2 in benchmark-optimizations.md)
    ///
    /// Uses byte-scanning instead of per-character `consume()`
    /// calls. Each `consume()` does 5-6 field updates (peek,
    /// newline check, col_utf8, col_utf16, last_char_was_cr,
    /// byte_offset). Byte scanning does one branch per byte and
    /// batch-updates position state once at the end.
    ///
    /// Without this optimization, skipping 4 spaces (typical
    /// indentation) would do ~24 field updates. With byte
    /// scanning: 4 byte comparisons + ~5 batch updates.
    fn skip_whitespace(&mut self) {
        let bytes = self.source.as_bytes();
        // `i` is the scan cursor; `curr_byte_offset` stays untouched until
        // the batch update at the end.
        let mut i = self.curr_byte_offset;
        let mut last_newline_byte_pos: Option<usize> = None;
        let mut lines_added: usize = 0;
        // Seed from the struct flag so a `\r\n` split across calls (the `\r`
        // consumed elsewhere, the `\n` seen here) still counts as one line.
        let mut last_was_cr = self.last_char_was_cr;
        // Track BOM count since last newline so we can compute
        // character columns (BOM is 3 bytes but 1 column).
        let mut bom_after_last_nl: usize = 0;

        loop {
            if i >= bytes.len() {
                break;
            }
            match bytes[i] {
                b' ' | b'\t' => {
                    last_was_cr = false;
                    i += 1;
                },
                b'\n' => {
                    // Only count the line if this `\n` is not the second
                    // half of a `\r\n` pair (already counted at the `\r`).
                    if !last_was_cr {
                        lines_added += 1;
                    }
                    last_was_cr = false;
                    last_newline_byte_pos = Some(i);
                    bom_after_last_nl = 0;
                    i += 1;
                },
                b'\r' => {
                    // `\r` always starts a new line, whether or not a `\n`
                    // follows; the flag suppresses double-counting.
                    lines_added += 1;
                    last_was_cr = true;
                    last_newline_byte_pos = Some(i);
                    bom_after_last_nl = 0;
                    i += 1;
                },
                // BOM: U+FEFF = 0xEF 0xBB 0xBF in UTF-8.
                // Rare in practice but must be handled correctly.
                0xEF if i + 2 < bytes.len()
                    && bytes[i + 1] == 0xBB
                    && bytes[i + 2] == 0xBF => {
                    last_was_cr = false;
                    bom_after_last_nl += 1;
                    i += 3;
                },
                // Any other byte ends the whitespace run.
                _ => break,
            }
        }

        if i == self.curr_byte_offset {
            // Nothing consumed — leave all position state untouched.
            return;
        }

        // Batch-update position state.
        self.curr_line += lines_added;
        self.last_char_was_cr = last_was_cr;

        if let Some(nl_pos) = last_newline_byte_pos {
            // Column resets after a newline. Count characters
            // from after the last newline to the current
            // position. For ASCII whitespace, bytes = chars.
            // Each BOM contributes 3 bytes but only 1 column,
            // hence the `* 2` correction per BOM.
            let bytes_after_nl = i - (nl_pos + 1);
            let col = bytes_after_nl - bom_after_last_nl * 2;
            self.curr_col_utf8 = col;
            self.curr_col_utf16 = col;
        } else {
            // No newlines in this whitespace run — advance
            // columns from current position. Each BOM
            // contributes 3 bytes but only 1 column.
            let consumed_bytes = i - self.curr_byte_offset;
            let col_advance =
                consumed_bytes - bom_after_last_nl * 2;
            self.curr_col_utf8 += col_advance;
            self.curr_col_utf16 += col_advance;
        }

        self.curr_byte_offset = i;
    }
540
541 // =========================================================================
542 // Comment lexing
543 // =========================================================================
544
545 /// Lexes a comment and adds it to pending trivia.
546 ///
547 /// A comment starts with `#` and extends to the end of the line.
548 ///
549 /// # Performance (B2 in benchmark-optimizations.md)
550 ///
551 /// Uses byte-scanning to find end-of-line instead of
552 /// per-character `peek_char()` + `consume()`. Comments never
553 /// span multiple lines, so line number doesn't change — only
554 /// the column advances. Column is computed once at the end
555 /// via `compute_columns_for_span()` (with an ASCII fast path
556 /// for the common case).
557 fn lex_comment(&mut self, start: SourcePosition) {
558 // Consume the '#' (single ASCII byte).
559 self.curr_byte_offset += 1;
560 self.curr_col_utf8 += 1;
561 self.curr_col_utf16 += 1;
562 self.last_char_was_cr = false;
563
564 let content_start = self.curr_byte_offset;
565 let bytes = self.source.as_bytes();
566
567 // Byte-scan to end of line or EOF. All line-ending bytes
568 // (\n = 0x0A, \r = 0x0D) are ASCII, so they can never
569 // appear as continuation bytes in multi-byte UTF-8
570 // sequences. This makes byte-scanning safe even when the
571 // comment contains Unicode characters.
572 let mut i = content_start;
573 while i < bytes.len()
574 && bytes[i] != b'\n'
575 && bytes[i] != b'\r' {
576 i += 1;
577 }
578
579 // Batch-update column for the comment content.
580 // Comments are single-line, so only column advances.
581 let (col_utf8, col_utf16) =
582 compute_columns_for_span(
583 &self.source[content_start..i],
584 );
585 self.curr_col_utf8 += col_utf8;
586 self.curr_col_utf16 += col_utf16;
587 self.curr_byte_offset = i;
588
589 let content = &self.source[content_start..i];
590 let span = self.make_span(start);
591
592 self.pending_trivia.push(GraphQLTriviaToken::Comment {
593 value: Cow::Borrowed(content),
594 span,
595 });
596 }
597
598 // =========================================================================
599 // Dot / Ellipsis lexing
600 // =========================================================================
601
    /// Lexes dots, producing either an Ellipsis token or an error.
    ///
    /// This implements a state machine for dot handling similar to
    /// `RustMacroGraphQLTokenSource`:
    /// - `...` (adjacent) → `Ellipsis`
    /// - `.` alone → Error (no hint - could be many things like `Foo.Bar`)
    /// - `..` (adjacent) → Error with help to add third dot
    /// - `. .` (spaced, same line) → Error with help about spacing
    /// - `.. .` (first two adjacent, third spaced) → Error with help about
    ///   spacing
    /// - `. ..` (first spaced, last two adjacent) → Error with help about
    ///   spacing
    /// - `. . .` (all spaced, same line) → Error with help about spacing
    /// - Dots on different lines → Separate errors
    ///
    /// TODO: Look for patterns like `{Name}.{Name}` and give a useful error
    /// hint (e.g., user may have been trying to use enum syntax incorrectly).
    fn lex_dot_or_ellipsis(&mut self, start: SourcePosition) -> GraphQLToken<'src> {
        // Dots are only consolidated when they sit on the same line as the
        // first one; this is compared against after every whitespace skip.
        let first_dot_line = self.curr_line;

        // Consume first dot
        self.consume();

        // Check for second dot (may be adjacent or spaced)
        self.skip_whitespace_same_line();

        match self.peek_char() {
            Some('.') if self.curr_line == first_dot_line => {
                let second_dot_start = self.curr_position();
                // A `.` is exactly one byte, so "adjacent" means the byte
                // offsets differ by exactly 1.
                let first_two_adjacent = second_dot_start.byte_offset() == start.byte_offset() + 1;
                self.consume();

                // Check for third dot
                self.skip_whitespace_same_line();

                match self.peek_char() {
                    Some('.') if self.curr_line == first_dot_line => {
                        let third_dot_start = self.curr_position();
                        self.consume();
                        let span = self.make_span(start);

                        // Check if all three dots were adjacent (no whitespace)
                        let second_third_adjacent =
                            third_dot_start.byte_offset() == second_dot_start.byte_offset() + 1;

                        if first_two_adjacent && second_third_adjacent {
                            // All adjacent - valid ellipsis
                            self.make_token(GraphQLTokenKind::Ellipsis, span)
                        } else if first_two_adjacent {
                            // `.. .` - first two adjacent, third spaced
                            let kind = GraphQLTokenKind::Error {
                                message: "Unexpected `.. .`".to_string(),
                                error_notes: smallvec![GraphQLErrorNote::help(
                                    "This `.` may have been intended to complete a `...` spread \
                                    operator. Try removing the extra spacing between the dots."
                                )],
                            };
                            self.make_token(kind, span)
                        } else if second_third_adjacent {
                            // `. ..` - first spaced, last two adjacent
                            let kind = GraphQLTokenKind::Error {
                                message: "Unexpected `. ..`".to_string(),
                                error_notes: smallvec![GraphQLErrorNote::help(
                                    "These dots may have been intended to form a `...` spread \
                                    operator. Try removing the extra spacing between the dots."
                                )],
                            };
                            self.make_token(kind, span)
                        } else {
                            // `. . .` - all spaced
                            let kind = GraphQLTokenKind::Error {
                                message: "Unexpected `. . .`".to_string(),
                                error_notes: smallvec![GraphQLErrorNote::help(
                                    "These dots may have been intended to form a `...` spread \
                                    operator. Try removing the extra spacing between the dots."
                                )],
                            };
                            self.make_token(kind, span)
                        }
                    }
                    _ => {
                        // Only two dots found on this line
                        let span = self.make_span(start);

                        if first_two_adjacent {
                            // Adjacent `..` - suggest adding third dot
                            let kind = GraphQLTokenKind::Error {
                                message: "Unexpected `..` (use `...` for spread operator)"
                                    .to_string(),
                                error_notes: smallvec![GraphQLErrorNote::help(
                                    "Add one more `.` to form the spread operator `...`"
                                )],
                            };
                            self.make_token(kind, span)
                        } else {
                            // Spaced `. .` - suggest removing spacing
                            let kind = GraphQLTokenKind::Error {
                                message: "Unexpected `. .` (use `...` for spread operator)"
                                    .to_string(),
                                error_notes: smallvec![GraphQLErrorNote::help(
                                    "These dots may have been intended to form a `...` spread \
                                    operator. Try removing the extra spacing between the dots."
                                )],
                            };
                            self.make_token(kind, span)
                        }
                    }
                }
            }
            _ => {
                // Single dot (or dots on different lines)
                // Don't assume it was meant to be ellipsis - could be `Foo.Bar` style
                // NOTE: the span here includes any same-line whitespace that
                // was skipped after the dot while probing for a second one.
                let span = self.make_span(start);
                let kind = GraphQLTokenKind::Error {
                    message: "Unexpected `.`".to_string(),
                    error_notes: smallvec![],
                };
                self.make_token(kind, span)
            }
        }
    }
723
724 /// Skips whitespace but only on the same line.
725 ///
726 /// Used for dot consolidation - we only merge dots that are on the same
727 /// line.
728 fn skip_whitespace_same_line(&mut self) {
729 while let Some(ch) = self.peek_char() {
730 match ch {
731 ' ' | '\t' | '\u{FEFF}' => {
732 self.consume();
733 }
734 _ => break,
735 }
736 }
737 }
738
739 // =========================================================================
740 // Name lexing
741 // =========================================================================
742
743 /// Lexes a name or keyword.
744 ///
745 /// Names match the pattern: `/[_A-Za-z][_0-9A-Za-z]*/`
746 ///
747 /// Keywords `true`, `false`, and `null` are emitted as distinct token
748 /// kinds.
749 ///
750 /// # Performance (B2 in benchmark-optimizations.md)
751 ///
752 /// Uses byte-scanning to find the end of the name in a tight
753 /// loop (one byte comparison per iteration), then updates
754 /// position tracking once for the entire name. This avoids
755 /// calling `consume()` per character, which would do 5-6 field
756 /// updates per character (peek, newline check, col_utf8,
757 /// col_utf16, last_char_was_cr, byte_offset).
758 ///
759 /// This is safe because GraphQL names are ASCII-only by spec
760 /// (`[_A-Za-z][_0-9A-Za-z]*`) and never contain newlines, so:
761 /// - Every byte is exactly one character
762 /// - Column advances by the number of bytes consumed
763 /// - Line number never changes
764 /// - `last_char_was_cr` is always cleared
765 fn lex_name(&mut self, start: SourcePosition) -> GraphQLToken<'src> {
766 let name_start = self.curr_byte_offset;
767 let bytes = self.source.as_bytes();
768
769 // Byte-scan: skip first char (already validated as name
770 // start) and continue while bytes match [_0-9A-Za-z].
771 let mut i = name_start + 1;
772 while i < bytes.len() && is_name_continue_byte(bytes[i]) {
773 i += 1;
774 }
775
776 let name_len = i - name_start;
777
778 // Batch-update position: names are ASCII-only, no
779 // newlines, so column advances by name length and line
780 // stays the same.
781 self.curr_byte_offset = i;
782 self.curr_col_utf8 += name_len;
783 self.curr_col_utf16 += name_len;
784 self.last_char_was_cr = false;
785
786 let name = &self.source[name_start..i];
787 let span = self.make_span(start);
788
789 // Check for keywords
790 let kind = match name {
791 "true" => GraphQLTokenKind::True,
792 "false" => GraphQLTokenKind::False,
793 "null" => GraphQLTokenKind::Null,
794 _ => GraphQLTokenKind::name_borrowed(name),
795 };
796
797 self.make_token(kind, span)
798 }
799
800 // =========================================================================
801 // Number lexing
802 // =========================================================================
803
804 /// Lexes an integer or float literal.
805 ///
806 /// Handles:
807 /// - Optional negative sign: `-`
808 /// - Integer part: `0` or `[1-9][0-9]*`
809 /// - Optional decimal part: `.[0-9]+`
810 /// - Optional exponent: `[eE][+-]?[0-9]+`
811 fn lex_number(&mut self, start: SourcePosition) -> GraphQLToken<'src> {
812 let num_start = self.curr_byte_offset;
813 let mut is_float = false;
814
815 // Optional negative sign
816 if self.peek_char() == Some('-') {
817 self.consume();
818 }
819
820 // Integer part
821 match self.peek_char() {
822 Some('0') => {
823 self.consume();
824 // Check for invalid leading zeros (e.g., 00, 01)
825 if let Some(ch) = self.peek_char()
826 && ch.is_ascii_digit() {
827 // Invalid: leading zeros
828 return self.lex_number_error(
829 start,
830 num_start,
831 "Invalid number: leading zeros are not allowed",
832 Some("https://spec.graphql.org/September2025/#sec-Int-Value"),
833 );
834 }
835 }
836 Some(ch) if ch.is_ascii_digit() => {
837 // Non-zero start
838 self.consume();
839 while let Some(ch) = self.peek_char() {
840 if ch.is_ascii_digit() {
841 self.consume();
842 } else {
843 break;
844 }
845 }
846 }
847 Some(_) | None => {
848 // Just a `-` with no digits
849 let span = self.make_span(start);
850 let kind = GraphQLTokenKind::Error {
851 message: "Unexpected `-`".to_string(),
852 error_notes: smallvec![],
853 };
854 return self.make_token(kind, span);
855 }
856 }
857
858 // Optional decimal part
859 if self.peek_char() == Some('.') {
860 // Check that the next character is a digit (not another dot for `...`)
861 if let Some(ch) = self.peek_char_nth(1)
862 && ch.is_ascii_digit() {
863 is_float = true;
864 self.consume(); // consume the '.'
865
866 // Consume decimal digits
867 while let Some(ch) = self.peek_char() {
868 if ch.is_ascii_digit() {
869 self.consume();
870 } else {
871 break;
872 }
873 }
874 }
875 }
876
877 // Optional exponent part
878 if let Some(ch) = self.peek_char()
879 && (ch == 'e' || ch == 'E') {
880 is_float = true;
881 self.consume();
882
883 // Optional sign
884 if let Some(ch) = self.peek_char()
885 && (ch == '+' || ch == '-') {
886 self.consume();
887 }
888
889 // Exponent digits (required)
890 let has_exponent_digits = matches!(self.peek_char(), Some(ch) if ch.is_ascii_digit());
891 if !has_exponent_digits {
892 return self.lex_number_error(
893 start,
894 num_start,
895 "Invalid number: exponent must have at least one digit",
896 Some("https://spec.graphql.org/September2025/#sec-Float-Value"),
897 );
898 }
899
900 while let Some(ch) = self.peek_char() {
901 if ch.is_ascii_digit() {
902 self.consume();
903 } else {
904 break;
905 }
906 }
907 }
908
909 let num_end = self.curr_byte_offset;
910 let num_text = &self.source[num_start..num_end];
911 let span = self.make_span(start);
912
913 let kind = if is_float {
914 GraphQLTokenKind::float_value_borrowed(num_text)
915 } else {
916 GraphQLTokenKind::int_value_borrowed(num_text)
917 };
918
919 self.make_token(kind, span)
920 }
921
922 /// Creates an error token for an invalid number.
923 fn lex_number_error(
924 &mut self,
925 start: SourcePosition,
926 num_start: usize,
927 message: &str,
928 spec_url: Option<&str>,
929 ) -> GraphQLToken<'src> {
930 // Consume remaining number-like characters to provide better error recovery
931 while let Some(ch) = self.peek_char() {
932 if ch.is_ascii_digit() || ch == '.' || ch == 'e' || ch == 'E' || ch == '+' || ch == '-' {
933 self.consume();
934 } else {
935 break;
936 }
937 }
938
939 let num_end = self.curr_byte_offset;
940 let invalid_text = &self.source[num_start..num_end];
941 let span = self.make_span(start);
942
943 let mut error_notes = smallvec![];
944 if let Some(url) = spec_url {
945 error_notes.push(GraphQLErrorNote::spec(url));
946 }
947
948 let kind = GraphQLTokenKind::Error {
949 message: format!("{message}: `{invalid_text}`"),
950 error_notes,
951 };
952
953 self.make_token(kind, span)
954 }
955
956 // =========================================================================
957 // String lexing
958 // =========================================================================
959
    /// Lexes a string literal (single-line or block string).
    ///
    /// The emitted token's value is the raw source slice *including* the
    /// surrounding quotes. Escape sequences are consumed but not decoded
    /// here — NOTE(review): presumably unescaping happens downstream;
    /// confirm against the token consumer.
    ///
    /// Block strings (`"""`) are delegated to `lex_block_string`.
    fn lex_string(&mut self, start: SourcePosition) -> GraphQLToken<'src> {
        let str_start = self.curr_byte_offset;

        // Check for block string (""")
        if self.remaining().starts_with("\"\"\"") {
            return self.lex_block_string(start, str_start);
        }

        // Single-line string
        self.consume(); // consume opening "

        loop {
            match self.peek_char() {
                None => {
                    // Unterminated string: reached EOF before the closing
                    // quote. (`start` is cloned because it is used for both
                    // the token span and the "started here" note.)
                    let span = self.make_span(start.clone());
                    let kind = GraphQLTokenKind::Error {
                        message: "Unterminated string literal".to_string(),
                        error_notes: smallvec![
                            GraphQLErrorNote::general_with_span(
                                "String started here",
                                self.make_span(start),
                            ),
                            GraphQLErrorNote::help("Add closing `\"`"),
                        ],
                    };
                    return self.make_token(kind, span);
                }
                Some('\n') | Some('\r') => {
                    // Unescaped newline in single-line string - consume it so
                    // the span includes the newline character
                    self.consume();
                    // Also consume \n if this was \r\n
                    if self.last_char_was_cr && self.peek_char() == Some('\n') {
                        self.consume();
                    }
                    let span = self.make_span(start);
                    let kind = GraphQLTokenKind::Error {
                        message: "Unterminated string literal".to_string(),
                        error_notes: smallvec![
                            GraphQLErrorNote::general(
                                "Single-line strings cannot contain unescaped newlines"
                            ),
                            GraphQLErrorNote::help("Use a block string (triple quotes) for multi-line strings, or escape the newline with `\\n`"),
                        ],
                    };
                    return self.make_token(kind, span);
                }
                Some('"') => {
                    // End of string
                    self.consume();
                    break;
                }
                Some('\\') => {
                    // Escape sequence - consume backslash and next character
                    // unconditionally, so an escaped `\"` cannot terminate
                    // the string.
                    self.consume();
                    if self.peek_char().is_some() {
                        self.consume();
                    }
                }
                Some(_) => {
                    // Ordinary string content (including multi-byte UTF-8).
                    self.consume();
                }
            }
        }

        let str_end = self.curr_byte_offset;
        let string_text = &self.source[str_start..str_end];
        let span = self.make_span(start);

        self.make_token(GraphQLTokenKind::string_value_borrowed(string_text), span)
    }
1033
    /// Lexes a block string literal.
    ///
    /// Preconditions: `self.curr_byte_offset` is at the opening `"""`
    /// (already verified by [`Self::lex_string`]); `str_start` is the
    /// byte offset where the token's value slice begins (the first `"`).
    /// The returned token's text includes the opening and closing
    /// quote runs, borrowed zero-copy from the source.
    ///
    /// # Performance (B2 in benchmark-optimizations.md)
    ///
    /// Uses byte-scanning instead of per-character
    /// `peek_char()`/`consume()` calls. The scan loop checks
    /// each byte against the special characters (`"`, `\`, `\n`,
    /// `\r`) and skips everything else with a single `i += 1`.
    /// Position is batch-updated once at the end.
    ///
    /// This is safe for multi-byte UTF-8 content because the
    /// sentinel bytes (`"` = 0x22, `\` = 0x5C, `\n` = 0x0A,
    /// `\r` = 0x0D) are all ASCII (<0x80) and can never appear
    /// as continuation bytes in multi-byte UTF-8 sequences
    /// (which are always >=0x80).
    fn lex_block_string(
        &mut self,
        start: SourcePosition,
        str_start: usize,
    ) -> GraphQLToken<'src> {
        let bytes = self.source.as_bytes();
        let scan_start = self.curr_byte_offset;

        // Skip opening """ (3 ASCII bytes, caller verified).
        let mut i = scan_start + 3;
        // Line/column bookkeeping accumulated locally during the scan
        // and applied to `self` in one batch afterwards.
        let mut lines_added: usize = 0;
        let mut last_newline_byte_pos: Option<usize> = None;
        let mut last_was_cr = false;

        // `true` if a closing `"""` was found, `false` on EOF.
        let found_close = loop {
            if i >= bytes.len() {
                break false;
            }

            match bytes[i] {
                b'"' if i + 2 < bytes.len()
                    && bytes[i + 1] == b'"'
                    && bytes[i + 2] == b'"' =>
                {
                    // Closing """.
                    i += 3;
                    last_was_cr = false;
                    break true;
                },
                b'\\' if i + 3 < bytes.len()
                    && bytes[i + 1] == b'"'
                    && bytes[i + 2] == b'"'
                    && bytes[i + 3] == b'"' =>
                {
                    // Escaped triple quote \""" — skip all four bytes so
                    // the quotes are not mistaken for a close.
                    last_was_cr = false;
                    i += 4;
                },
                b'\n' => {
                    // A \n directly following \r completes a \r\n pair
                    // that was already counted when the \r was seen.
                    if !last_was_cr {
                        lines_added += 1;
                    }
                    last_was_cr = false;
                    last_newline_byte_pos = Some(i);
                    i += 1;
                },
                b'\r' => {
                    // Count \r immediately; a following \n is folded
                    // into the same line terminator by the arm above.
                    lines_added += 1;
                    last_was_cr = true;
                    last_newline_byte_pos = Some(i);
                    i += 1;
                },
                _ => {
                    last_was_cr = false;
                    i += 1;
                },
            }
        };

        // Batch-update position state.
        self.curr_line += lines_added;
        self.last_char_was_cr = last_was_cr;

        if let Some(nl_pos) = last_newline_byte_pos {
            // Column resets after the last newline.
            // NOTE(review): sets the column to the char count of the text
            // after that newline — assumes the column counter is 0-based
            // at line start; confirm against `consume()`'s newline
            // handling.
            let (col_utf8, col_utf16) =
                compute_columns_for_span(
                    &self.source[nl_pos + 1..i],
                );
            self.curr_col_utf8 = col_utf8;
            self.curr_col_utf16 = col_utf16;
        } else {
            // No newlines — advance columns from current
            // position. `curr_byte_offset` has not moved yet, so this
            // slice covers the whole literal including the quotes.
            let (col_utf8, col_utf16) =
                compute_columns_for_span(
                    &self.source
                        [self.curr_byte_offset..i],
                );
            self.curr_col_utf8 += col_utf8;
            self.curr_col_utf16 += col_utf16;
        }

        self.curr_byte_offset = i;

        if !found_close {
            // Unterminated block string.
            let span = self.make_span(start.clone());
            let kind = GraphQLTokenKind::Error {
                message: "Unterminated block string"
                    .to_string(),
                error_notes: smallvec![
                    GraphQLErrorNote::general_with_span(
                        "Block string started here",
                        self.make_span(start),
                    ),
                    GraphQLErrorNote::help(
                        "Add closing `\"\"\"`",
                    ),
                ],
            };
            return self.make_token(kind, span);
        }

        // Borrow the raw literal text (quote runs included) zero-copy.
        let str_end = self.curr_byte_offset;
        let string_text = &self.source[str_start..str_end];
        let span = self.make_span(start);

        self.make_token(
            GraphQLTokenKind::string_value_borrowed(
                string_text,
            ),
            span,
        )
    }
1164
1165 // =========================================================================
1166 // Invalid character handling
1167 // =========================================================================
1168
1169 /// Lexes an invalid character, producing an error token.
1170 fn lex_invalid_character(&mut self, start: SourcePosition) -> GraphQLToken<'src> {
1171 let ch = self.consume().unwrap();
1172 let span = self.make_span(start);
1173
1174 let kind = GraphQLTokenKind::Error {
1175 message: format!("Unexpected character {}", describe_char(ch)),
1176 error_notes: smallvec![],
1177 };
1178
1179 self.make_token(kind, span)
1180 }
1181}
1182
1183// =============================================================================
1184// Iterator implementation
1185// =============================================================================
1186
1187impl<'src> Iterator for StrGraphQLTokenSource<'src> {
1188 type Item = GraphQLToken<'src>;
1189
1190 fn next(&mut self) -> Option<Self::Item> {
1191 if self.finished {
1192 return None;
1193 }
1194
1195 let token = self.next_token();
1196
1197 if matches!(token.kind, GraphQLTokenKind::Eof) {
1198 self.finished = true;
1199 }
1200
1201 Some(token)
1202 }
1203}
1204
1205// =============================================================================
1206// Helper functions
1207// =============================================================================
1208
/// Returns `true` if `ch` can start a GraphQL name.
///
/// Per the GraphQL spec, names start with `NameStart` — an ASCII
/// letter or underscore:
/// <https://spec.graphql.org/September2025/#NameStart>
fn is_name_start(ch: char) -> bool {
    matches!(ch, '_' | 'A'..='Z' | 'a'..='z')
}
1216
/// Returns `true` if `b` can continue a GraphQL name.
///
/// Per the GraphQL spec, names continue with `NameContinue` — an
/// ASCII letter, digit, or underscore:
/// <https://spec.graphql.org/September2025/#NameContinue>
///
/// Byte-level check for use in byte-scanning fast paths (see B2
/// in benchmark-optimizations.md). Non-ASCII bytes (>=0x80)
/// always return false, which is correct since GraphQL names are
/// ASCII-only by spec.
#[inline]
fn is_name_continue_byte(b: u8) -> bool {
    matches!(b, b'_' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z')
}
1230
/// Computes `(utf8_char_count, utf16_code_unit_count)` for a string
/// slice. Used by byte-scanning fast paths to batch-compute column
/// advancement after scanning a range.
///
/// ASCII fast path: when every byte is ASCII (the common case for
/// GraphQL source text), the byte length equals both the UTF-8 char
/// count and the UTF-16 code unit count, so no per-character walk is
/// needed.
///
/// See B2 in benchmark-optimizations.md.
fn compute_columns_for_span(s: &str) -> (usize, usize) {
    if s.is_ascii() {
        return (s.len(), s.len());
    }

    // Non-ASCII: a single pass over the chars, accumulating both
    // column measures simultaneously.
    s.chars()
        .fold((0usize, 0usize), |(utf8_col, utf16_col), ch| {
            (utf8_col + 1, utf16_col + ch.len_utf16())
        })
}
1255
1256/// Returns a human-readable description of a character for error messages.
1257///
1258/// For printable characters, returns the character in backticks.
1259/// For invisible/control characters, includes Unicode code point description.
1260fn describe_char(ch: char) -> String {
1261 if ch.is_control() || (ch.is_whitespace() && ch != ' ') {
1262 // Invisible characters get detailed description
1263 let name = unicode_char_name(ch);
1264 if let Some(name) = name {
1265 format!("`{}` (U+{:04X}: {})", ch, ch as u32, name)
1266 } else {
1267 format!("`{}` (U+{:04X})", ch, ch as u32)
1268 }
1269 } else {
1270 format!("`{ch}`")
1271 }
1272}
1273
/// Returns the Unicode name for well-known invisible/control characters.
///
/// Used by [`describe_char`] to make otherwise-invisible characters
/// identifiable in error messages. This is a hand-curated subset of
/// the Unicode character database covering characters commonly pasted
/// into source text by accident: control codes, exotic spaces,
/// zero-width and bidirectional formatting characters, and the BOM.
/// Returns `None` for characters without a known name; callers then
/// fall back to showing only the code point.
fn unicode_char_name(ch: char) -> Option<&'static str> {
    match ch {
        // C0 control characters (U+0000 - U+001F)
        '\u{0000}' => Some("NULL"),
        '\u{0001}' => Some("START OF HEADING"),
        '\u{0002}' => Some("START OF TEXT"),
        '\u{0003}' => Some("END OF TEXT"),
        '\u{0004}' => Some("END OF TRANSMISSION"),
        '\u{0005}' => Some("ENQUIRY"),
        '\u{0006}' => Some("ACKNOWLEDGE"),
        '\u{0007}' => Some("BELL"),
        '\u{0008}' => Some("BACKSPACE"),
        '\u{0009}' => Some("HORIZONTAL TAB"),
        '\u{000A}' => Some("LINE FEED"),
        '\u{000B}' => Some("VERTICAL TAB"),
        '\u{000C}' => Some("FORM FEED"),
        '\u{000D}' => Some("CARRIAGE RETURN"),
        '\u{000E}' => Some("SHIFT OUT"),
        '\u{000F}' => Some("SHIFT IN"),
        '\u{0010}' => Some("DATA LINK ESCAPE"),
        '\u{0011}' => Some("DEVICE CONTROL ONE"),
        '\u{0012}' => Some("DEVICE CONTROL TWO"),
        '\u{0013}' => Some("DEVICE CONTROL THREE"),
        '\u{0014}' => Some("DEVICE CONTROL FOUR"),
        '\u{0015}' => Some("NEGATIVE ACKNOWLEDGE"),
        '\u{0016}' => Some("SYNCHRONOUS IDLE"),
        '\u{0017}' => Some("END OF TRANSMISSION BLOCK"),
        '\u{0018}' => Some("CANCEL"),
        '\u{0019}' => Some("END OF MEDIUM"),
        '\u{001A}' => Some("SUBSTITUTE"),
        '\u{001B}' => Some("ESCAPE"),
        '\u{001C}' => Some("FILE SEPARATOR"),
        '\u{001D}' => Some("GROUP SEPARATOR"),
        '\u{001E}' => Some("RECORD SEPARATOR"),
        '\u{001F}' => Some("UNIT SEPARATOR"),

        // C1 control characters and special (U+007F - U+00A0)
        '\u{007F}' => Some("DELETE"),
        '\u{0080}' => Some("PADDING CHARACTER"),
        '\u{0081}' => Some("HIGH OCTET PRESET"),
        '\u{0082}' => Some("BREAK PERMITTED HERE"),
        '\u{0083}' => Some("NO BREAK HERE"),
        '\u{0084}' => Some("INDEX"),
        '\u{0085}' => Some("NEXT LINE"),
        '\u{0086}' => Some("START OF SELECTED AREA"),
        '\u{0087}' => Some("END OF SELECTED AREA"),
        '\u{0088}' => Some("CHARACTER TABULATION SET"),
        '\u{0089}' => Some("CHARACTER TABULATION WITH JUSTIFICATION"),
        '\u{008A}' => Some("LINE TABULATION SET"),
        '\u{008B}' => Some("PARTIAL LINE FORWARD"),
        '\u{008C}' => Some("PARTIAL LINE BACKWARD"),
        '\u{008D}' => Some("REVERSE LINE FEED"),
        '\u{008E}' => Some("SINGLE SHIFT TWO"),
        '\u{008F}' => Some("SINGLE SHIFT THREE"),
        '\u{0090}' => Some("DEVICE CONTROL STRING"),
        '\u{0091}' => Some("PRIVATE USE ONE"),
        '\u{0092}' => Some("PRIVATE USE TWO"),
        '\u{0093}' => Some("SET TRANSMIT STATE"),
        '\u{0094}' => Some("CANCEL CHARACTER"),
        '\u{0095}' => Some("MESSAGE WAITING"),
        '\u{0096}' => Some("START OF GUARDED AREA"),
        '\u{0097}' => Some("END OF GUARDED AREA"),
        '\u{0098}' => Some("START OF STRING"),
        '\u{0099}' => Some("SINGLE GRAPHIC CHARACTER INTRODUCER"),
        '\u{009A}' => Some("SINGLE CHARACTER INTRODUCER"),
        '\u{009B}' => Some("CONTROL SEQUENCE INTRODUCER"),
        '\u{009C}' => Some("STRING TERMINATOR"),
        '\u{009D}' => Some("OPERATING SYSTEM COMMAND"),
        '\u{009E}' => Some("PRIVACY MESSAGE"),
        '\u{009F}' => Some("APPLICATION PROGRAM COMMAND"),
        '\u{00A0}' => Some("NO-BREAK SPACE"),
        '\u{00AD}' => Some("SOFT HYPHEN"),

        // General punctuation - spaces (U+2000 - U+200A)
        '\u{2000}' => Some("EN QUAD"),
        '\u{2001}' => Some("EM QUAD"),
        '\u{2002}' => Some("EN SPACE"),
        '\u{2003}' => Some("EM SPACE"),
        '\u{2004}' => Some("THREE-PER-EM SPACE"),
        '\u{2005}' => Some("FOUR-PER-EM SPACE"),
        '\u{2006}' => Some("SIX-PER-EM SPACE"),
        '\u{2007}' => Some("FIGURE SPACE"),
        '\u{2008}' => Some("PUNCTUATION SPACE"),
        '\u{2009}' => Some("THIN SPACE"),
        '\u{200A}' => Some("HAIR SPACE"),

        // Zero-width and formatting characters (U+200B - U+200F)
        '\u{200B}' => Some("ZERO WIDTH SPACE"),
        '\u{200C}' => Some("ZERO WIDTH NON-JOINER"),
        '\u{200D}' => Some("ZERO WIDTH JOINER"),
        '\u{200E}' => Some("LEFT-TO-RIGHT MARK"),
        '\u{200F}' => Some("RIGHT-TO-LEFT MARK"),

        // Bidirectional text formatting (U+202A - U+202F)
        '\u{202A}' => Some("LEFT-TO-RIGHT EMBEDDING"),
        '\u{202B}' => Some("RIGHT-TO-LEFT EMBEDDING"),
        '\u{202C}' => Some("POP DIRECTIONAL FORMATTING"),
        '\u{202D}' => Some("LEFT-TO-RIGHT OVERRIDE"),
        '\u{202E}' => Some("RIGHT-TO-LEFT OVERRIDE"),
        '\u{202F}' => Some("NARROW NO-BREAK SPACE"),

        // More formatting (U+2060 - U+206F)
        '\u{2060}' => Some("WORD JOINER"),
        '\u{2061}' => Some("FUNCTION APPLICATION"),
        '\u{2062}' => Some("INVISIBLE TIMES"),
        '\u{2063}' => Some("INVISIBLE SEPARATOR"),
        '\u{2064}' => Some("INVISIBLE PLUS"),
        '\u{2066}' => Some("LEFT-TO-RIGHT ISOLATE"),
        '\u{2067}' => Some("RIGHT-TO-LEFT ISOLATE"),
        '\u{2068}' => Some("FIRST STRONG ISOLATE"),
        '\u{2069}' => Some("POP DIRECTIONAL ISOLATE"),
        '\u{206A}' => Some("INHIBIT SYMMETRIC SWAPPING"),
        '\u{206B}' => Some("ACTIVATE SYMMETRIC SWAPPING"),
        '\u{206C}' => Some("INHIBIT ARABIC FORM SHAPING"),
        '\u{206D}' => Some("ACTIVATE ARABIC FORM SHAPING"),
        '\u{206E}' => Some("NATIONAL DIGIT SHAPES"),
        '\u{206F}' => Some("NOMINAL DIGIT SHAPES"),

        // Line/paragraph separators and other special spaces
        '\u{2028}' => Some("LINE SEPARATOR"),
        '\u{2029}' => Some("PARAGRAPH SEPARATOR"),
        '\u{205F}' => Some("MEDIUM MATHEMATICAL SPACE"),
        '\u{3000}' => Some("IDEOGRAPHIC SPACE"),

        // Special characters
        '\u{034F}' => Some("COMBINING GRAPHEME JOINER"),
        '\u{061C}' => Some("ARABIC LETTER MARK"),
        '\u{115F}' => Some("HANGUL CHOSEONG FILLER"),
        '\u{1160}' => Some("HANGUL JUNGSEONG FILLER"),
        '\u{17B4}' => Some("KHMER VOWEL INHERENT AQ"),
        '\u{17B5}' => Some("KHMER VOWEL INHERENT AA"),
        '\u{180E}' => Some("MONGOLIAN VOWEL SEPARATOR"),

        // BOM and special
        '\u{FEFF}' => Some("BYTE ORDER MARK"),
        '\u{FFFE}' => Some("NONCHARACTER"),
        '\u{FFFF}' => Some("NONCHARACTER"),

        // Interlinear annotation
        '\u{FFF9}' => Some("INTERLINEAR ANNOTATION ANCHOR"),
        '\u{FFFA}' => Some("INTERLINEAR ANNOTATION SEPARATOR"),
        '\u{FFFB}' => Some("INTERLINEAR ANNOTATION TERMINATOR"),

        // Tag characters (U+E0000 - U+E007F)
        '\u{E0001}' => Some("LANGUAGE TAG"),
        '\u{E0020}' => Some("TAG SPACE"),

        _ => None,
    }
}