Skip to main content

libgraphql_parser/token_source/
str_to_graphql_token_source.rs

1//! A [`GraphQLTokenSource`] that lexes from a `&str` input.
2//!
3//! This lexer implements zero-copy lexing: token values borrow directly from
4//! the source string using `Cow::Borrowed`, avoiding allocations for names,
5//! numbers, and strings.
6//!
7//! # Features
8//!
9//! - **Zero-copy lexing**: Token values borrow from source text when possible
10//! - **Dual column tracking**: Reports both UTF-8 character positions (for
11//!   display) and UTF-16 code unit positions (for LSP compatibility)
12//! - **Comment preservation**: GraphQL `#` comments are captured as trivia
13//! - **Error recovery**: Invalid characters emit `Error` tokens, allowing the
14//!   lexer to continue and report multiple errors
15//!
16//! # Usage
17//!
18//! ```rust
19//! use libgraphql_parser::token_source::StrGraphQLTokenSource;
20//!
21//! let source = "{ name }";
22//! let lexer = StrGraphQLTokenSource::new(source);
23//! for token in lexer {
24//!     println!("{:?}", token.kind);
25//! }
26//! // Output:
27//! // CurlyBraceOpen
28//! // Name(Borrowed("name"))
29//! // CurlyBraceClose
30//! // Eof
31//! ```
32
33use crate::smallvec;
34use crate::token::GraphQLToken;
35use crate::token::GraphQLTokenKind;
36use crate::token::GraphQLTriviaToken;
37use crate::token::GraphQLTriviaTokenVec;
38use crate::GraphQLErrorNote;
39use crate::GraphQLSourceSpan;
40use crate::SourcePosition;
41use std::borrow::Cow;
42use std::path::Path;
43
/// A [`GraphQLTokenSource`] that lexes from a `&str` input.
///
/// This lexer produces [`GraphQLToken`]s with zero-copy string values where
/// possible. The `'src` lifetime ties token values to the source string.
///
/// See module documentation for details.
pub struct StrGraphQLTokenSource<'src> {
    /// The full source text being lexed.
    source: &'src str,

    /// Current byte offset from the start of `source`.
    ///
    /// The remaining text to lex is `&source[curr_byte_offset..]`.
    ///
    /// Invariant: this always sits on a UTF-8 character boundary of
    /// `source` (every advance below moves by a whole character or a
    /// whole run of ASCII bytes), so slicing `source` at this offset
    /// never panics.
    curr_byte_offset: usize,

    /// Current 0-based line number.
    curr_line: usize,

    /// Current UTF-8 character column (0-based).
    ///
    /// This counts characters, not bytes. For example, "🎉" (4 bytes) advances
    /// this by 1.
    curr_col_utf8: usize,

    /// Current UTF-16 code unit column (0-based).
    ///
    /// Characters outside the Basic Multilingual Plane (U+10000 and above)
    /// advance this by 2 (surrogate pair). For example, "🎉" (U+1F389) advances
    /// this by 2.
    curr_col_utf16: usize,

    /// Whether the previous character was `\r`.
    ///
    /// Used to handle `\r\n` as a single newline: when we see `\r`, we set
    /// this flag; if the next character is `\n`, we skip it without
    /// incrementing the line number again.
    last_char_was_cr: bool,

    /// Trivia (comments, commas) accumulated before the next token.
    ///
    /// Drained (via `std::mem::take`) into each emitted token's
    /// `preceding_trivia`.
    pending_trivia: GraphQLTriviaTokenVec<'src>,

    /// Whether the EOF token has been emitted.
    finished: bool,

    /// Optional file path for error messages and spans.
    ///
    /// When present, this is included in `GraphQLSourceSpan::file_path`.
    /// Borrowed from the caller to avoid allocation.
    file_path: Option<&'src Path>,
}
94
95impl<'src> StrGraphQLTokenSource<'src> {
96    /// Creates a new token source from a string slice.
97    ///
98    /// # Example
99    ///
100    /// ```rust
101    /// # use libgraphql_parser::token_source::StrGraphQLTokenSource;
102    /// let lexer = StrGraphQLTokenSource::new("{ name }");
103    /// ```
104    pub fn new(source: &'src str) -> Self {
105        Self {
106            source,
107            curr_byte_offset: 0,
108            curr_line: 0,
109            curr_col_utf8: 0,
110            curr_col_utf16: 0,
111            last_char_was_cr: false,
112            pending_trivia: smallvec![],
113            finished: false,
114            file_path: None,
115        }
116    }
117
118    /// Creates a new token source with an associated file path.
119    ///
120    /// The file path is included in token spans for error reporting.
121    pub fn with_file_path(source: &'src str, path: &'src Path) -> Self {
122        Self {
123            source,
124            curr_byte_offset: 0,
125            curr_line: 0,
126            curr_col_utf8: 0,
127            curr_col_utf16: 0,
128            last_char_was_cr: false,
129            pending_trivia: smallvec![],
130            finished: false,
131            file_path: Some(path),
132        }
133    }
134
135    // =========================================================================
136    // Position and scanning helpers
137    // =========================================================================
138
139    /// Returns the remaining source text to be lexed.
140    fn remaining(&self) -> &'src str {
141        &self.source[self.curr_byte_offset..]
142    }
143
144    /// Returns the current source position.
145    fn curr_position(&self) -> SourcePosition {
146        SourcePosition::new(
147            self.curr_line,
148            self.curr_col_utf8,
149            Some(self.curr_col_utf16),
150            self.curr_byte_offset,
151        )
152    }
153
154    /// Peeks at the next character without consuming it.
155    ///
156    /// Returns `None` if at end of input.
157    ///
158    /// # Performance (B1 in benchmark-optimizations.md)
159    ///
160    /// This uses direct byte access with an ASCII fast path instead
161    /// of the naive `remaining().chars().next()`. GraphQL source text
162    /// is overwhelmingly ASCII (names, keywords, punctuators,
163    /// whitespace), so the fast path covers >99% of calls. The
164    /// non-ASCII fallback (Unicode in string literals/comments) is
165    /// rare and can remain slow.
166    ///
167    /// Without this optimization, every peek would construct a
168    /// `Chars` iterator and decode the first UTF-8 sequence — a
169    /// measurable cost given that peek is called millions of times
170    /// for large inputs.
171    #[inline]
172    fn peek_char(&self) -> Option<char> {
173        let bytes = self.source.as_bytes();
174        if self.curr_byte_offset >= bytes.len() {
175            return None;
176        }
177        let b = bytes[self.curr_byte_offset];
178        if b.is_ascii() {
179            // Fast path: single-byte ASCII character (covers >99%
180            // of GraphQL source text).
181            Some(b as char)
182        } else {
183            // Slow path: multi-byte UTF-8 character. Fall back to
184            // full UTF-8 decoding. This only triggers inside
185            // string literals or comments containing non-ASCII
186            // characters.
187            self.source[self.curr_byte_offset..].chars().next()
188        }
189    }
190
191    /// Peeks at the nth character ahead without consuming.
192    ///
193    /// `peek_char_nth(0)` is equivalent to `peek_char()`.
194    /// Returns `None` if there aren't enough characters remaining.
195    ///
196    /// Note: Unlike `peek_char()`, this still uses the iterator
197    /// approach since it needs to skip over variable-width UTF-8
198    /// characters to reach position n. This method is only called
199    /// in a few places for multi-character lookahead (e.g., number
200    /// parsing to check digit after `.`), so it is not a hot path.
201    fn peek_char_nth(&self, n: usize) -> Option<char> {
202        self.remaining().chars().nth(n)
203    }
204
    /// Consumes the next character and updates position tracking.
    ///
    /// Returns `None` if at end of input.
    ///
    /// This method handles:
    /// - Advancing byte offset by the character's UTF-8 length
    /// - Incrementing line number on newlines (`\n`, `\r`, `\r\n`)
    /// - Tracking UTF-8 character column and UTF-16 code unit column
    ///
    /// Invariant: `curr_byte_offset` is always on a char boundary on
    /// entry, so the non-ASCII branch can safely slice and decode.
    ///
    /// # Performance (B1 in benchmark-optimizations.md)
    ///
    /// Uses an ASCII fast path: if the current byte is <0x80, we
    /// know it is exactly 1 byte, 1 UTF-8 column, and 1 UTF-16
    /// code unit, so we avoid calling `ch.len_utf8()` and
    /// `ch.len_utf16()`. The slow path handles multi-byte UTF-8
    /// sequences.
    fn consume(&mut self) -> Option<char> {
        let bytes = self.source.as_bytes();
        if self.curr_byte_offset >= bytes.len() {
            return None;
        }

        let b = bytes[self.curr_byte_offset];

        if b.is_ascii() {
            // ASCII fast path: 1 byte, 1 UTF-8 col, 1 UTF-16 unit
            let ch = b as char;

            if ch == '\n' {
                if self.last_char_was_cr {
                    // Second half of a `\r\n` pair: the `\r` already
                    // bumped the line and reset the columns, so only
                    // clear the flag here.
                    self.last_char_was_cr = false;
                } else {
                    // Lone `\n`: start a new line.
                    self.curr_line += 1;
                    self.curr_col_utf8 = 0;
                    self.curr_col_utf16 = 0;
                }
            } else if ch == '\r' {
                // `\r` always starts a new line; the flag lets a
                // following `\n` be folded into the same newline.
                self.curr_line += 1;
                self.curr_col_utf8 = 0;
                self.curr_col_utf16 = 0;
                self.last_char_was_cr = true;
            } else {
                // Ordinary ASCII character: advance both columns by 1.
                self.curr_col_utf8 += 1;
                self.curr_col_utf16 += 1;
                self.last_char_was_cr = false;
            }

            self.curr_byte_offset += 1;
            Some(ch)
        } else {
            // Multi-byte UTF-8 character (non-ASCII). This only
            // occurs inside string literals or comments containing
            // Unicode characters. We fall back to full char
            // decoding to get the correct byte length and UTF-16
            // length. The `unwrap()` cannot fail: `curr_byte_offset`
            // is on a char boundary and we just checked it is in
            // bounds.
            let ch = self.source[self.curr_byte_offset..]
                .chars()
                .next()
                .unwrap();
            let byte_len = ch.len_utf8();

            // Non-ASCII characters are never newlines (GraphQL line
            // terminators are only `\n`/`\r`), so always advance
            // columns. A supplementary-plane char advances UTF-16 by 2.
            self.curr_col_utf8 += 1;
            self.curr_col_utf16 += ch.len_utf16();
            self.last_char_was_cr = false;

            self.curr_byte_offset += byte_len;
            Some(ch)
        }
    }
276
277    /// Creates a `GraphQLSourceSpan` from a start position to the current
278    /// position.
279    fn make_span(&self, start: SourcePosition) -> GraphQLSourceSpan {
280        let end = self.curr_position();
281        if let Some(path) = self.file_path {
282            GraphQLSourceSpan::with_file(start, end, path.to_path_buf())
283        } else {
284            GraphQLSourceSpan::new(start, end)
285        }
286    }
287
288    // =========================================================================
289    // Token creation helpers
290    // =========================================================================
291
292    /// Creates a token with the accumulated trivia.
293    fn make_token(
294        &mut self,
295        kind: GraphQLTokenKind<'src>,
296        span: GraphQLSourceSpan,
297    ) -> GraphQLToken<'src> {
298        GraphQLToken {
299            kind,
300            preceding_trivia: std::mem::take(&mut self.pending_trivia),
301            span,
302        }
303    }
304
305    // =========================================================================
306    // Lexer main loop
307    // =========================================================================
308
309    /// Advances to the next token, skipping whitespace and collecting trivia.
310    fn next_token(&mut self) -> GraphQLToken<'src> {
311        loop {
312            // Skip whitespace
313            self.skip_whitespace();
314
315            let start = self.curr_position();
316
317            match self.peek_char() {
318                None => {
319                    // End of input
320                    let span = self.make_span(start);
321                    return self.make_token(GraphQLTokenKind::Eof, span);
322                }
323
324                Some('#') => {
325                    // Comment - collect as trivia and continue
326                    self.lex_comment(start);
327                    continue;
328                }
329
330                Some(',') => {
331                    // Comma - collect as trivia and continue
332                    self.consume();
333                    let span = self.make_span(start);
334                    self.pending_trivia
335                        .push(GraphQLTriviaToken::Comma { span });
336                    continue;
337                }
338
339                // Single-character punctuators
340                Some('!') => {
341                    self.consume();
342                    let span = self.make_span(start);
343                    return self.make_token(GraphQLTokenKind::Bang, span);
344                }
345                Some('$') => {
346                    self.consume();
347                    let span = self.make_span(start);
348                    return self.make_token(GraphQLTokenKind::Dollar, span);
349                }
350                Some('&') => {
351                    self.consume();
352                    let span = self.make_span(start);
353                    return self.make_token(GraphQLTokenKind::Ampersand, span);
354                }
355                Some('(') => {
356                    self.consume();
357                    let span = self.make_span(start);
358                    return self.make_token(GraphQLTokenKind::ParenOpen, span);
359                }
360                Some(')') => {
361                    self.consume();
362                    let span = self.make_span(start);
363                    return self.make_token(GraphQLTokenKind::ParenClose, span);
364                }
365                Some(':') => {
366                    self.consume();
367                    let span = self.make_span(start);
368                    return self.make_token(GraphQLTokenKind::Colon, span);
369                }
370                Some('=') => {
371                    self.consume();
372                    let span = self.make_span(start);
373                    return self.make_token(GraphQLTokenKind::Equals, span);
374                }
375                Some('@') => {
376                    self.consume();
377                    let span = self.make_span(start);
378                    return self.make_token(GraphQLTokenKind::At, span);
379                }
380                Some('[') => {
381                    self.consume();
382                    let span = self.make_span(start);
383                    return self.make_token(GraphQLTokenKind::SquareBracketOpen, span);
384                }
385                Some(']') => {
386                    self.consume();
387                    let span = self.make_span(start);
388                    return self.make_token(GraphQLTokenKind::SquareBracketClose, span);
389                }
390                Some('{') => {
391                    self.consume();
392                    let span = self.make_span(start);
393                    return self.make_token(GraphQLTokenKind::CurlyBraceOpen, span);
394                }
395                Some('}') => {
396                    self.consume();
397                    let span = self.make_span(start);
398                    return self.make_token(GraphQLTokenKind::CurlyBraceClose, span);
399                }
400                Some('|') => {
401                    self.consume();
402                    let span = self.make_span(start);
403                    return self.make_token(GraphQLTokenKind::Pipe, span);
404                }
405
406                // Ellipsis or dot error
407                Some('.') => {
408                    return self.lex_dot_or_ellipsis(start);
409                }
410
411                // String literals
412                Some('"') => {
413                    return self.lex_string(start);
414                }
415
416                // Names and keywords
417                Some(c) if is_name_start(c) => {
418                    return self.lex_name(start);
419                }
420
421                // Numbers (including negative)
422                Some(c) if c == '-' || c.is_ascii_digit() => {
423                    return self.lex_number(start);
424                }
425
426                // Invalid character
427                Some(_) => {
428                    return self.lex_invalid_character(start);
429                }
430            }
431        }
432    }
433
434    // =========================================================================
435    // Whitespace handling
436    // =========================================================================
437
    /// Skips whitespace characters.
    ///
    /// Per the GraphQL spec, these are "ignored tokens":
    /// - Space (U+0020)
    /// - Tab (U+0009)
    /// - Line terminators: LF (U+000A), CR (U+000D), CRLF
    /// - BOM (U+FEFF) - Unicode BOM is ignored anywhere in the document
    ///
    /// See: <https://spec.graphql.org/September2025/#sec-Language.Source-Text.Unicode>
    ///
    /// Note: Comma is also whitespace in GraphQL but we handle it separately
    /// to preserve it as trivia.
    ///
    /// # Performance (B2 in benchmark-optimizations.md)
    ///
    /// Uses byte-scanning instead of per-character `consume()`
    /// calls. Each `consume()` does 5-6 field updates (peek,
    /// newline check, col_utf8, col_utf16, last_char_was_cr,
    /// byte_offset). Byte scanning does one branch per byte and
    /// batch-updates position state once at the end.
    ///
    /// Without this optimization, skipping 4 spaces (typical
    /// indentation) would do ~24 field updates. With byte
    /// scanning: 4 byte comparisons + ~5 batch updates.
    fn skip_whitespace(&mut self) {
        let bytes = self.source.as_bytes();
        let mut i = self.curr_byte_offset;
        // Byte index of the last `\n` or `\r` seen, if any; columns
        // restart counting from the byte after it.
        let mut last_newline_byte_pos: Option<usize> = None;
        let mut lines_added: usize = 0;
        // Seeded from lexer state so a `\n` that completes a `\r\n`
        // pair split across a previous consume() isn't double-counted.
        let mut last_was_cr = self.last_char_was_cr;
        // Track BOM count since last newline so we can compute
        // character columns (BOM is 3 bytes but 1 column).
        let mut bom_after_last_nl: usize = 0;

        loop {
            if i >= bytes.len() {
                break;
            }
            match bytes[i] {
                b' ' | b'\t' => {
                    last_was_cr = false;
                    i += 1;
                },
                b'\n' => {
                    // Only count the line if this `\n` isn't the tail
                    // of a `\r\n` pair (the `\r` already counted it).
                    if !last_was_cr {
                        lines_added += 1;
                    }
                    last_was_cr = false;
                    last_newline_byte_pos = Some(i);
                    bom_after_last_nl = 0;
                    i += 1;
                },
                b'\r' => {
                    lines_added += 1;
                    last_was_cr = true;
                    last_newline_byte_pos = Some(i);
                    bom_after_last_nl = 0;
                    i += 1;
                },
                // BOM: U+FEFF = 0xEF 0xBB 0xBF in UTF-8.
                // Rare in practice but must be handled correctly.
                0xEF if i + 2 < bytes.len()
                    && bytes[i + 1] == 0xBB
                    && bytes[i + 2] == 0xBF => {
                    last_was_cr = false;
                    bom_after_last_nl += 1;
                    i += 3;
                },
                // Any other byte ends the whitespace run (0xEF that
                // doesn't start a BOM also lands here).
                _ => break,
            }
        }

        // Nothing consumed: leave all position state untouched.
        if i == self.curr_byte_offset {
            return;
        }

        // Batch-update position state.
        self.curr_line += lines_added;
        self.last_char_was_cr = last_was_cr;

        if let Some(nl_pos) = last_newline_byte_pos {
            // Column resets after a newline. Count characters
            // from after the last newline to the current
            // position. For ASCII whitespace, bytes = chars.
            // Each BOM contributes 3 bytes but only 1 column
            // (hence the `* 2` correction per BOM).
            let bytes_after_nl = i - (nl_pos + 1);
            let col = bytes_after_nl - bom_after_last_nl * 2;
            self.curr_col_utf8 = col;
            self.curr_col_utf16 = col;
        } else {
            // No newlines in this whitespace run — advance
            // columns from current position. Each BOM
            // contributes 3 bytes but only 1 column.
            let consumed_bytes = i - self.curr_byte_offset;
            let col_advance =
                consumed_bytes - bom_after_last_nl * 2;
            self.curr_col_utf8 += col_advance;
            self.curr_col_utf16 += col_advance;
        }

        self.curr_byte_offset = i;
    }
540
541    // =========================================================================
542    // Comment lexing
543    // =========================================================================
544
545    /// Lexes a comment and adds it to pending trivia.
546    ///
547    /// A comment starts with `#` and extends to the end of the line.
548    ///
549    /// # Performance (B2 in benchmark-optimizations.md)
550    ///
551    /// Uses byte-scanning to find end-of-line instead of
552    /// per-character `peek_char()` + `consume()`. Comments never
553    /// span multiple lines, so line number doesn't change — only
554    /// the column advances. Column is computed once at the end
555    /// via `compute_columns_for_span()` (with an ASCII fast path
556    /// for the common case).
557    fn lex_comment(&mut self, start: SourcePosition) {
558        // Consume the '#' (single ASCII byte).
559        self.curr_byte_offset += 1;
560        self.curr_col_utf8 += 1;
561        self.curr_col_utf16 += 1;
562        self.last_char_was_cr = false;
563
564        let content_start = self.curr_byte_offset;
565        let bytes = self.source.as_bytes();
566
567        // Byte-scan to end of line or EOF. All line-ending bytes
568        // (\n = 0x0A, \r = 0x0D) are ASCII, so they can never
569        // appear as continuation bytes in multi-byte UTF-8
570        // sequences. This makes byte-scanning safe even when the
571        // comment contains Unicode characters.
572        let mut i = content_start;
573        while i < bytes.len()
574            && bytes[i] != b'\n'
575            && bytes[i] != b'\r' {
576            i += 1;
577        }
578
579        // Batch-update column for the comment content.
580        // Comments are single-line, so only column advances.
581        let (col_utf8, col_utf16) =
582            compute_columns_for_span(
583                &self.source[content_start..i],
584            );
585        self.curr_col_utf8 += col_utf8;
586        self.curr_col_utf16 += col_utf16;
587        self.curr_byte_offset = i;
588
589        let content = &self.source[content_start..i];
590        let span = self.make_span(start);
591
592        self.pending_trivia.push(GraphQLTriviaToken::Comment {
593            value: Cow::Borrowed(content),
594            span,
595        });
596    }
597
598    // =========================================================================
599    // Dot / Ellipsis lexing
600    // =========================================================================
601
    /// Lexes dots, producing either an Ellipsis token or an error.
    ///
    /// This implements a state machine for dot handling similar to
    /// `RustMacroGraphQLTokenSource`:
    /// - `...` (adjacent) → `Ellipsis`
    /// - `.` alone → Error (no hint - could be many things like `Foo.Bar`)
    /// - `..` (adjacent) → Error with help to add third dot
    /// - `. .` (spaced, same line) → Error with help about spacing
    /// - `.. .` (first two adjacent, third spaced) → Error with help about
    ///   spacing
    /// - `. ..` (first spaced, last two adjacent) → Error with help about
    ///   spacing
    /// - `. . .` (all spaced, same line) → Error with help about spacing
    /// - Dots on different lines → Separate errors
    ///
    /// Adjacency is detected by comparing byte offsets: two dots are
    /// adjacent iff the second starts exactly one byte after the first.
    ///
    /// TODO: Look for patterns like `{Name}.{Name}` and give a useful error
    /// hint (e.g., user may have been trying to use enum syntax incorrectly).
    fn lex_dot_or_ellipsis(&mut self, start: SourcePosition) -> GraphQLToken<'src> {
        let first_dot_line = self.curr_line;

        // Consume first dot
        self.consume();

        // Check for second dot (may be adjacent or spaced). Whitespace
        // skipping is restricted to the same line: dots on different
        // lines are deliberately NOT merged into one error.
        self.skip_whitespace_same_line();

        match self.peek_char() {
            Some('.') if self.curr_line == first_dot_line => {
                let second_dot_start = self.curr_position();
                // Adjacent means no whitespace separated the dots:
                // the second dot sits one byte after the first.
                let first_two_adjacent = second_dot_start.byte_offset() == start.byte_offset() + 1;
                self.consume();

                // Check for third dot
                self.skip_whitespace_same_line();

                match self.peek_char() {
                    Some('.') if self.curr_line == first_dot_line => {
                        let third_dot_start = self.curr_position();
                        self.consume();
                        let span = self.make_span(start);

                        // Check if all three dots were adjacent (no whitespace)
                        let second_third_adjacent =
                            third_dot_start.byte_offset() == second_dot_start.byte_offset() + 1;

                        if first_two_adjacent && second_third_adjacent {
                            // All adjacent - valid ellipsis
                            self.make_token(GraphQLTokenKind::Ellipsis, span)
                        } else if first_two_adjacent {
                            // `.. .` - first two adjacent, third spaced
                            let kind = GraphQLTokenKind::Error {
                                message: "Unexpected `.. .`".to_string(),
                                error_notes: smallvec![GraphQLErrorNote::help(
                                    "This `.` may have been intended to complete a `...` spread \
                                     operator. Try removing the extra spacing between the dots."
                                )],
                            };
                            self.make_token(kind, span)
                        } else if second_third_adjacent {
                            // `. ..` - first spaced, last two adjacent
                            let kind = GraphQLTokenKind::Error {
                                message: "Unexpected `. ..`".to_string(),
                                error_notes: smallvec![GraphQLErrorNote::help(
                                    "These dots may have been intended to form a `...` spread \
                                     operator. Try removing the extra spacing between the dots."
                                )],
                            };
                            self.make_token(kind, span)
                        } else {
                            // `. . .` - all spaced
                            let kind = GraphQLTokenKind::Error {
                                message: "Unexpected `. . .`".to_string(),
                                error_notes: smallvec![GraphQLErrorNote::help(
                                    "These dots may have been intended to form a `...` spread \
                                     operator. Try removing the extra spacing between the dots."
                                )],
                            };
                            self.make_token(kind, span)
                        }
                    }
                    _ => {
                        // Only two dots found on this line
                        let span = self.make_span(start);

                        if first_two_adjacent {
                            // Adjacent `..` - suggest adding third dot
                            let kind = GraphQLTokenKind::Error {
                                message: "Unexpected `..` (use `...` for spread operator)"
                                    .to_string(),
                                error_notes: smallvec![GraphQLErrorNote::help(
                                    "Add one more `.` to form the spread operator `...`"
                                )],
                            };
                            self.make_token(kind, span)
                        } else {
                            // Spaced `. .` - suggest removing spacing
                            let kind = GraphQLTokenKind::Error {
                                message: "Unexpected `. .` (use `...` for spread operator)"
                                    .to_string(),
                                error_notes: smallvec![GraphQLErrorNote::help(
                                    "These dots may have been intended to form a `...` spread \
                                     operator. Try removing the extra spacing between the dots."
                                )],
                            };
                            self.make_token(kind, span)
                        }
                    }
                }
            }
            _ => {
                // Single dot (or dots on different lines)
                // Don't assume it was meant to be ellipsis - could be `Foo.Bar` style
                let span = self.make_span(start);
                let kind = GraphQLTokenKind::Error {
                    message: "Unexpected `.`".to_string(),
                    error_notes: smallvec![],
                };
                self.make_token(kind, span)
            }
        }
    }
723
724    /// Skips whitespace but only on the same line.
725    ///
726    /// Used for dot consolidation - we only merge dots that are on the same
727    /// line.
728    fn skip_whitespace_same_line(&mut self) {
729        while let Some(ch) = self.peek_char() {
730            match ch {
731                ' ' | '\t' | '\u{FEFF}' => {
732                    self.consume();
733                }
734                _ => break,
735            }
736        }
737    }
738
739    // =========================================================================
740    // Name lexing
741    // =========================================================================
742
743    /// Lexes a name or keyword.
744    ///
745    /// Names match the pattern: `/[_A-Za-z][_0-9A-Za-z]*/`
746    ///
747    /// Keywords `true`, `false`, and `null` are emitted as distinct token
748    /// kinds.
749    ///
750    /// # Performance (B2 in benchmark-optimizations.md)
751    ///
752    /// Uses byte-scanning to find the end of the name in a tight
753    /// loop (one byte comparison per iteration), then updates
754    /// position tracking once for the entire name. This avoids
755    /// calling `consume()` per character, which would do 5-6 field
756    /// updates per character (peek, newline check, col_utf8,
757    /// col_utf16, last_char_was_cr, byte_offset).
758    ///
759    /// This is safe because GraphQL names are ASCII-only by spec
760    /// (`[_A-Za-z][_0-9A-Za-z]*`) and never contain newlines, so:
761    /// - Every byte is exactly one character
762    /// - Column advances by the number of bytes consumed
763    /// - Line number never changes
764    /// - `last_char_was_cr` is always cleared
765    fn lex_name(&mut self, start: SourcePosition) -> GraphQLToken<'src> {
766        let name_start = self.curr_byte_offset;
767        let bytes = self.source.as_bytes();
768
769        // Byte-scan: skip first char (already validated as name
770        // start) and continue while bytes match [_0-9A-Za-z].
771        let mut i = name_start + 1;
772        while i < bytes.len() && is_name_continue_byte(bytes[i]) {
773            i += 1;
774        }
775
776        let name_len = i - name_start;
777
778        // Batch-update position: names are ASCII-only, no
779        // newlines, so column advances by name length and line
780        // stays the same.
781        self.curr_byte_offset = i;
782        self.curr_col_utf8 += name_len;
783        self.curr_col_utf16 += name_len;
784        self.last_char_was_cr = false;
785
786        let name = &self.source[name_start..i];
787        let span = self.make_span(start);
788
789        // Check for keywords
790        let kind = match name {
791            "true" => GraphQLTokenKind::True,
792            "false" => GraphQLTokenKind::False,
793            "null" => GraphQLTokenKind::Null,
794            _ => GraphQLTokenKind::name_borrowed(name),
795        };
796
797        self.make_token(kind, span)
798    }
799
800    // =========================================================================
801    // Number lexing
802    // =========================================================================
803
    /// Lexes an integer or float literal.
    ///
    /// Handles:
    /// - Optional negative sign: `-`
    /// - Integer part: `0` or `[1-9][0-9]*`
    /// - Optional decimal part: `.[0-9]+`
    /// - Optional exponent: `[eE][+-]?[0-9]+`
    ///
    /// Emits a float kind once a decimal part or exponent has been
    /// consumed, otherwise an int kind; both borrow the raw source text
    /// (zero-copy). Malformed numbers (leading zeros, exponent without
    /// digits, bare `-`) produce `Error` tokens; the first two go through
    /// [`Self::lex_number_error`], which also consumes trailing
    /// number-like characters for recovery.
    fn lex_number(&mut self, start: SourcePosition) -> GraphQLToken<'src> {
        let num_start = self.curr_byte_offset;
        // Flipped to true when a fractional part or exponent is seen.
        let mut is_float = false;

        // Optional negative sign
        if self.peek_char() == Some('-') {
            self.consume();
        }

        // Integer part
        match self.peek_char() {
            Some('0') => {
                self.consume();
                // Check for invalid leading zeros (e.g., 00, 01)
                if let Some(ch) = self.peek_char()
                    && ch.is_ascii_digit() {
                    // Invalid: leading zeros
                    return self.lex_number_error(
                        start,
                        num_start,
                        "Invalid number: leading zeros are not allowed",
                        Some("https://spec.graphql.org/September2025/#sec-Int-Value"),
                    );
                }
            }
            Some(ch) if ch.is_ascii_digit() => {
                // Non-zero start
                self.consume();
                while let Some(ch) = self.peek_char() {
                    if ch.is_ascii_digit() {
                        self.consume();
                    } else {
                        break;
                    }
                }
            }
            Some(_) | None => {
                // Just a `-` with no digits
                let span = self.make_span(start);
                let kind = GraphQLTokenKind::Error {
                    message: "Unexpected `-`".to_string(),
                    error_notes: smallvec![],
                };
                return self.make_token(kind, span);
            }
        }

        // Optional decimal part
        if self.peek_char() == Some('.') {
            // Check that the next character is a digit (not another dot for `...`)
            // If it isn't, the `.` is left unconsumed here, so the token
            // ends as an integer and the dot is lexed on the next call.
            if let Some(ch) = self.peek_char_nth(1)
                && ch.is_ascii_digit() {
                is_float = true;
                self.consume(); // consume the '.'

                // Consume decimal digits
                while let Some(ch) = self.peek_char() {
                    if ch.is_ascii_digit() {
                        self.consume();
                    } else {
                        break;
                    }
                }
            }
        }

        // Optional exponent part
        if let Some(ch) = self.peek_char()
            && (ch == 'e' || ch == 'E') {
            is_float = true;
            self.consume();

            // Optional sign
            if let Some(ch) = self.peek_char()
                && (ch == '+' || ch == '-') {
                self.consume();
            }

            // Exponent digits (required)
            let has_exponent_digits = matches!(self.peek_char(), Some(ch) if ch.is_ascii_digit());
            if !has_exponent_digits {
                return self.lex_number_error(
                    start,
                    num_start,
                    "Invalid number: exponent must have at least one digit",
                    Some("https://spec.graphql.org/September2025/#sec-Float-Value"),
                );
            }

            while let Some(ch) = self.peek_char() {
                if ch.is_ascii_digit() {
                    self.consume();
                } else {
                    break;
                }
            }
        }

        // The token value borrows the raw lexed text (zero-copy).
        let num_end = self.curr_byte_offset;
        let num_text = &self.source[num_start..num_end];
        let span = self.make_span(start);

        let kind = if is_float {
            GraphQLTokenKind::float_value_borrowed(num_text)
        } else {
            GraphQLTokenKind::int_value_borrowed(num_text)
        };

        self.make_token(kind, span)
    }
921
922    /// Creates an error token for an invalid number.
923    fn lex_number_error(
924        &mut self,
925        start: SourcePosition,
926        num_start: usize,
927        message: &str,
928        spec_url: Option<&str>,
929    ) -> GraphQLToken<'src> {
930        // Consume remaining number-like characters to provide better error recovery
931        while let Some(ch) = self.peek_char() {
932            if ch.is_ascii_digit() || ch == '.' || ch == 'e' || ch == 'E' || ch == '+' || ch == '-' {
933                self.consume();
934            } else {
935                break;
936            }
937        }
938
939        let num_end = self.curr_byte_offset;
940        let invalid_text = &self.source[num_start..num_end];
941        let span = self.make_span(start);
942
943        let mut error_notes = smallvec![];
944        if let Some(url) = spec_url {
945            error_notes.push(GraphQLErrorNote::spec(url));
946        }
947
948        let kind = GraphQLTokenKind::Error {
949            message: format!("{message}: `{invalid_text}`"),
950            error_notes,
951        };
952
953        self.make_token(kind, span)
954    }
955
956    // =========================================================================
957    // String lexing
958    // =========================================================================
959
    /// Lexes a string literal (single-line or block string).
    ///
    /// Dispatches to [`Self::lex_block_string`] when the input starts
    /// with `"""`. For single-line strings, scans to the closing `"`,
    /// skipping escape sequences (`\` plus the following character,
    /// without validating them). The token value borrows the raw source
    /// text, quotes and escapes included (zero-copy).
    ///
    /// Error cases, both emitting `Error` tokens:
    /// - EOF before the closing quote (unterminated string)
    /// - An unescaped newline, which is consumed so the error span
    ///   covers it
    fn lex_string(&mut self, start: SourcePosition) -> GraphQLToken<'src> {
        let str_start = self.curr_byte_offset;

        // Check for block string (""")
        if self.remaining().starts_with("\"\"\"") {
            return self.lex_block_string(start, str_start);
        }

        // Single-line string
        self.consume(); // consume opening "

        loop {
            match self.peek_char() {
                None => {
                    // Unterminated string: hit EOF before the closing quote.
                    let span = self.make_span(start.clone());
                    let kind = GraphQLTokenKind::Error {
                        message: "Unterminated string literal".to_string(),
                        error_notes: smallvec![
                            GraphQLErrorNote::general_with_span(
                                "String started here",
                                self.make_span(start),
                            ),
                            GraphQLErrorNote::help("Add closing `\"`"),
                        ],
                    };
                    return self.make_token(kind, span);
                }
                Some('\n') | Some('\r') => {
                    // Unescaped newline in single-line string - consume it so
                    // the span includes the newline character
                    self.consume();
                    // Also consume \n if this was \r\n
                    if self.last_char_was_cr && self.peek_char() == Some('\n') {
                        self.consume();
                    }
                    let span = self.make_span(start);
                    let kind = GraphQLTokenKind::Error {
                        message: "Unterminated string literal".to_string(),
                        error_notes: smallvec![
                            GraphQLErrorNote::general(
                                "Single-line strings cannot contain unescaped newlines"
                            ),
                            GraphQLErrorNote::help("Use a block string (triple quotes) for multi-line strings, or escape the newline with `\\n`"),
                        ],
                    };
                    return self.make_token(kind, span);
                }
                Some('"') => {
                    // End of string
                    self.consume();
                    break;
                }
                Some('\\') => {
                    // Escape sequence - consume backslash and next character
                    // (no validation; the raw text is preserved verbatim)
                    self.consume();
                    if self.peek_char().is_some() {
                        self.consume();
                    }
                }
                Some(_) => {
                    self.consume();
                }
            }
        }

        // Borrow the raw text, quotes included (zero-copy).
        let str_end = self.curr_byte_offset;
        let string_text = &self.source[str_start..str_end];
        let span = self.make_span(start);

        self.make_token(GraphQLTokenKind::string_value_borrowed(string_text), span)
    }
1033
    /// Lexes a block string literal.
    ///
    /// Scans from the opening `"""` to the first unescaped closing
    /// `"""`, treating `\"""` as an escaped triple quote (kept as raw
    /// content). Newlines inside the block are tracked so line/column
    /// state can be batch-updated afterward; EOF before the closing
    /// quotes produces an `Error` token. The token value borrows the
    /// raw source text including the quotes (zero-copy).
    ///
    /// # Performance (B2 in benchmark-optimizations.md)
    ///
    /// Uses byte-scanning instead of per-character
    /// `peek_char()`/`consume()` calls. The scan loop checks
    /// each byte against the special characters (`"`, `\`, `\n`,
    /// `\r`) and skips everything else with a single `i += 1`.
    /// Position is batch-updated once at the end.
    ///
    /// This is safe for multi-byte UTF-8 content because the
    /// sentinel bytes (`"` = 0x22, `\` = 0x5C, `\n` = 0x0A,
    /// `\r` = 0x0D) are all ASCII (<0x80) and can never appear
    /// as continuation bytes in multi-byte UTF-8 sequences
    /// (which are always >=0x80).
    fn lex_block_string(
        &mut self,
        start: SourcePosition,
        str_start: usize,
    ) -> GraphQLToken<'src> {
        let bytes = self.source.as_bytes();
        let scan_start = self.curr_byte_offset;

        // Skip opening """ (3 ASCII bytes, caller verified).
        let mut i = scan_start + 3;
        let mut lines_added: usize = 0;
        let mut last_newline_byte_pos: Option<usize> = None;
        let mut last_was_cr = false;

        let found_close = loop {
            if i >= bytes.len() {
                break false;
            }

            match bytes[i] {
                b'"' if i + 2 < bytes.len()
                    && bytes[i + 1] == b'"'
                    && bytes[i + 2] == b'"' =>
                {
                    // Closing """.
                    i += 3;
                    last_was_cr = false;
                    break true;
                },
                b'\\' if i + 3 < bytes.len()
                    && bytes[i + 1] == b'"'
                    && bytes[i + 2] == b'"'
                    && bytes[i + 3] == b'"' =>
                {
                    // Escaped triple quote \""" - skipped whole so the
                    // quotes are not mistaken for the terminator.
                    last_was_cr = false;
                    i += 4;
                },
                b'\n' => {
                    // \n after \r is the tail of a \r\n pair: the \r
                    // already counted the line, so don't count again.
                    if !last_was_cr {
                        lines_added += 1;
                    }
                    last_was_cr = false;
                    last_newline_byte_pos = Some(i);
                    i += 1;
                },
                b'\r' => {
                    lines_added += 1;
                    last_was_cr = true;
                    last_newline_byte_pos = Some(i);
                    i += 1;
                },
                _ => {
                    last_was_cr = false;
                    i += 1;
                },
            }
        };

        // Batch-update position state.
        self.curr_line += lines_added;
        self.last_char_was_cr = last_was_cr;

        if let Some(nl_pos) = last_newline_byte_pos {
            // Column resets after the last newline.
            let (col_utf8, col_utf16) =
                compute_columns_for_span(
                    &self.source[nl_pos + 1..i],
                );
            self.curr_col_utf8 = col_utf8;
            self.curr_col_utf16 = col_utf16;
        } else {
            // No newlines — advance columns from current
            // position.
            let (col_utf8, col_utf16) =
                compute_columns_for_span(
                    &self.source
                        [self.curr_byte_offset..i],
                );
            self.curr_col_utf8 += col_utf8;
            self.curr_col_utf16 += col_utf16;
        }

        self.curr_byte_offset = i;

        if !found_close {
            // Unterminated block string.
            let span = self.make_span(start.clone());
            let kind = GraphQLTokenKind::Error {
                message: "Unterminated block string"
                    .to_string(),
                error_notes: smallvec![
                    GraphQLErrorNote::general_with_span(
                        "Block string started here",
                        self.make_span(start),
                    ),
                    GraphQLErrorNote::help(
                        "Add closing `\"\"\"`",
                    ),
                ],
            };
            return self.make_token(kind, span);
        }

        let str_end = self.curr_byte_offset;
        let string_text = &self.source[str_start..str_end];
        let span = self.make_span(start);

        self.make_token(
            GraphQLTokenKind::string_value_borrowed(
                string_text,
            ),
            span,
        )
    }
1164
1165    // =========================================================================
1166    // Invalid character handling
1167    // =========================================================================
1168
1169    /// Lexes an invalid character, producing an error token.
1170    fn lex_invalid_character(&mut self, start: SourcePosition) -> GraphQLToken<'src> {
1171        let ch = self.consume().unwrap();
1172        let span = self.make_span(start);
1173
1174        let kind = GraphQLTokenKind::Error {
1175            message: format!("Unexpected character {}", describe_char(ch)),
1176            error_notes: smallvec![],
1177        };
1178
1179        self.make_token(kind, span)
1180    }
1181}
1182
1183// =============================================================================
1184// Iterator implementation
1185// =============================================================================
1186
1187impl<'src> Iterator for StrGraphQLTokenSource<'src> {
1188    type Item = GraphQLToken<'src>;
1189
1190    fn next(&mut self) -> Option<Self::Item> {
1191        if self.finished {
1192            return None;
1193        }
1194
1195        let token = self.next_token();
1196
1197        if matches!(token.kind, GraphQLTokenKind::Eof) {
1198            self.finished = true;
1199        }
1200
1201        Some(token)
1202    }
1203}
1204
1205// =============================================================================
1206// Helper functions
1207// =============================================================================
1208
/// Returns `true` if `ch` can start a GraphQL name.
///
/// Per the GraphQL spec, names start with `NameStart`:
/// <https://spec.graphql.org/September2025/#NameStart>
fn is_name_start(ch: char) -> bool {
    // NameStart is `_` or an ASCII letter; non-ASCII letters are
    // excluded by spec.
    matches!(ch, '_' | 'a'..='z' | 'A'..='Z')
}
1216
/// Returns `true` if `b` can continue a GraphQL name.
///
/// Per the GraphQL spec, names continue with `NameContinue`:
/// <https://spec.graphql.org/September2025/#NameContinue>
///
/// Byte-level check for use in byte-scanning fast paths (see B2
/// in benchmark-optimizations.md). Non-ASCII bytes (>=0x80)
/// always return false, which is correct since GraphQL names are
/// ASCII-only by spec.
#[inline]
fn is_name_continue_byte(b: u8) -> bool {
    matches!(b, b'_' | b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z')
}
1230
/// Computes (utf8_char_count, utf16_code_unit_count) for a
/// string slice. Used by byte-scanning fast paths to batch-
/// compute column advancement after scanning a range.
///
/// Has an ASCII fast path: when all bytes are ASCII (the common
/// case for GraphQL source text), the byte count equals both the
/// UTF-8 char count and the UTF-16 code unit count, so no
/// per-character iteration is needed.
///
/// See B2 in benchmark-optimizations.md.
fn compute_columns_for_span(s: &str) -> (usize, usize) {
    if s.is_ascii() {
        // Fast path: bytes == chars == UTF-16 units for ASCII.
        (s.len(), s.len())
    } else {
        // Single pass over the chars, accumulating both widths.
        s.chars().fold((0usize, 0usize), |(utf8_col, utf16_col), ch| {
            (utf8_col + 1, utf16_col + ch.len_utf16())
        })
    }
}
1255
1256/// Returns a human-readable description of a character for error messages.
1257///
1258/// For printable characters, returns the character in backticks.
1259/// For invisible/control characters, includes Unicode code point description.
1260fn describe_char(ch: char) -> String {
1261    if ch.is_control() || (ch.is_whitespace() && ch != ' ') {
1262        // Invisible characters get detailed description
1263        let name = unicode_char_name(ch);
1264        if let Some(name) = name {
1265            format!("`{}` (U+{:04X}: {})", ch, ch as u32, name)
1266        } else {
1267            format!("`{}` (U+{:04X})", ch, ch as u32)
1268        }
1269    } else {
1270        format!("`{ch}`")
1271    }
1272}
1273
/// Returns the Unicode name for well-known invisible/control characters.
///
/// This provides meaningful names for commonly encountered invisible
/// characters. Returns `None` for characters without a known name.
///
/// This is a hand-curated subset of the Unicode character database,
/// not an exhaustive table; it covers the invisible characters most
/// likely to sneak into GraphQL source (control codes, exotic spaces,
/// zero-width and bidi formatting characters, BOM).
fn unicode_char_name(ch: char) -> Option<&'static str> {
    match ch {
        // C0 control characters (U+0000 - U+001F)
        '\u{0000}' => Some("NULL"),
        '\u{0001}' => Some("START OF HEADING"),
        '\u{0002}' => Some("START OF TEXT"),
        '\u{0003}' => Some("END OF TEXT"),
        '\u{0004}' => Some("END OF TRANSMISSION"),
        '\u{0005}' => Some("ENQUIRY"),
        '\u{0006}' => Some("ACKNOWLEDGE"),
        '\u{0007}' => Some("BELL"),
        '\u{0008}' => Some("BACKSPACE"),
        '\u{0009}' => Some("HORIZONTAL TAB"),
        '\u{000A}' => Some("LINE FEED"),
        '\u{000B}' => Some("VERTICAL TAB"),
        '\u{000C}' => Some("FORM FEED"),
        '\u{000D}' => Some("CARRIAGE RETURN"),
        '\u{000E}' => Some("SHIFT OUT"),
        '\u{000F}' => Some("SHIFT IN"),
        '\u{0010}' => Some("DATA LINK ESCAPE"),
        '\u{0011}' => Some("DEVICE CONTROL ONE"),
        '\u{0012}' => Some("DEVICE CONTROL TWO"),
        '\u{0013}' => Some("DEVICE CONTROL THREE"),
        '\u{0014}' => Some("DEVICE CONTROL FOUR"),
        '\u{0015}' => Some("NEGATIVE ACKNOWLEDGE"),
        '\u{0016}' => Some("SYNCHRONOUS IDLE"),
        '\u{0017}' => Some("END OF TRANSMISSION BLOCK"),
        '\u{0018}' => Some("CANCEL"),
        '\u{0019}' => Some("END OF MEDIUM"),
        '\u{001A}' => Some("SUBSTITUTE"),
        '\u{001B}' => Some("ESCAPE"),
        '\u{001C}' => Some("FILE SEPARATOR"),
        '\u{001D}' => Some("GROUP SEPARATOR"),
        '\u{001E}' => Some("RECORD SEPARATOR"),
        '\u{001F}' => Some("UNIT SEPARATOR"),

        // C1 control characters and special (U+007F - U+00A0)
        '\u{007F}' => Some("DELETE"),
        '\u{0080}' => Some("PADDING CHARACTER"),
        '\u{0081}' => Some("HIGH OCTET PRESET"),
        '\u{0082}' => Some("BREAK PERMITTED HERE"),
        '\u{0083}' => Some("NO BREAK HERE"),
        '\u{0084}' => Some("INDEX"),
        '\u{0085}' => Some("NEXT LINE"),
        '\u{0086}' => Some("START OF SELECTED AREA"),
        '\u{0087}' => Some("END OF SELECTED AREA"),
        '\u{0088}' => Some("CHARACTER TABULATION SET"),
        '\u{0089}' => Some("CHARACTER TABULATION WITH JUSTIFICATION"),
        '\u{008A}' => Some("LINE TABULATION SET"),
        '\u{008B}' => Some("PARTIAL LINE FORWARD"),
        '\u{008C}' => Some("PARTIAL LINE BACKWARD"),
        '\u{008D}' => Some("REVERSE LINE FEED"),
        '\u{008E}' => Some("SINGLE SHIFT TWO"),
        '\u{008F}' => Some("SINGLE SHIFT THREE"),
        '\u{0090}' => Some("DEVICE CONTROL STRING"),
        '\u{0091}' => Some("PRIVATE USE ONE"),
        '\u{0092}' => Some("PRIVATE USE TWO"),
        '\u{0093}' => Some("SET TRANSMIT STATE"),
        '\u{0094}' => Some("CANCEL CHARACTER"),
        '\u{0095}' => Some("MESSAGE WAITING"),
        '\u{0096}' => Some("START OF GUARDED AREA"),
        '\u{0097}' => Some("END OF GUARDED AREA"),
        '\u{0098}' => Some("START OF STRING"),
        '\u{0099}' => Some("SINGLE GRAPHIC CHARACTER INTRODUCER"),
        '\u{009A}' => Some("SINGLE CHARACTER INTRODUCER"),
        '\u{009B}' => Some("CONTROL SEQUENCE INTRODUCER"),
        '\u{009C}' => Some("STRING TERMINATOR"),
        '\u{009D}' => Some("OPERATING SYSTEM COMMAND"),
        '\u{009E}' => Some("PRIVACY MESSAGE"),
        '\u{009F}' => Some("APPLICATION PROGRAM COMMAND"),
        '\u{00A0}' => Some("NO-BREAK SPACE"),
        '\u{00AD}' => Some("SOFT HYPHEN"),

        // General punctuation - spaces (U+2000 - U+200A)
        '\u{2000}' => Some("EN QUAD"),
        '\u{2001}' => Some("EM QUAD"),
        '\u{2002}' => Some("EN SPACE"),
        '\u{2003}' => Some("EM SPACE"),
        '\u{2004}' => Some("THREE-PER-EM SPACE"),
        '\u{2005}' => Some("FOUR-PER-EM SPACE"),
        '\u{2006}' => Some("SIX-PER-EM SPACE"),
        '\u{2007}' => Some("FIGURE SPACE"),
        '\u{2008}' => Some("PUNCTUATION SPACE"),
        '\u{2009}' => Some("THIN SPACE"),
        '\u{200A}' => Some("HAIR SPACE"),

        // Zero-width and formatting characters (U+200B - U+200F)
        '\u{200B}' => Some("ZERO WIDTH SPACE"),
        '\u{200C}' => Some("ZERO WIDTH NON-JOINER"),
        '\u{200D}' => Some("ZERO WIDTH JOINER"),
        '\u{200E}' => Some("LEFT-TO-RIGHT MARK"),
        '\u{200F}' => Some("RIGHT-TO-LEFT MARK"),

        // Bidirectional text formatting (U+202A - U+202F)
        '\u{202A}' => Some("LEFT-TO-RIGHT EMBEDDING"),
        '\u{202B}' => Some("RIGHT-TO-LEFT EMBEDDING"),
        '\u{202C}' => Some("POP DIRECTIONAL FORMATTING"),
        '\u{202D}' => Some("LEFT-TO-RIGHT OVERRIDE"),
        '\u{202E}' => Some("RIGHT-TO-LEFT OVERRIDE"),
        '\u{202F}' => Some("NARROW NO-BREAK SPACE"),

        // More formatting (U+2060 - U+206F)
        '\u{2060}' => Some("WORD JOINER"),
        '\u{2061}' => Some("FUNCTION APPLICATION"),
        '\u{2062}' => Some("INVISIBLE TIMES"),
        '\u{2063}' => Some("INVISIBLE SEPARATOR"),
        '\u{2064}' => Some("INVISIBLE PLUS"),
        '\u{2066}' => Some("LEFT-TO-RIGHT ISOLATE"),
        '\u{2067}' => Some("RIGHT-TO-LEFT ISOLATE"),
        '\u{2068}' => Some("FIRST STRONG ISOLATE"),
        '\u{2069}' => Some("POP DIRECTIONAL ISOLATE"),
        '\u{206A}' => Some("INHIBIT SYMMETRIC SWAPPING"),
        '\u{206B}' => Some("ACTIVATE SYMMETRIC SWAPPING"),
        '\u{206C}' => Some("INHIBIT ARABIC FORM SHAPING"),
        '\u{206D}' => Some("ACTIVATE ARABIC FORM SHAPING"),
        '\u{206E}' => Some("NATIONAL DIGIT SHAPES"),
        '\u{206F}' => Some("NOMINAL DIGIT SHAPES"),

        // Other special spaces
        '\u{2028}' => Some("LINE SEPARATOR"),
        '\u{2029}' => Some("PARAGRAPH SEPARATOR"),
        '\u{205F}' => Some("MEDIUM MATHEMATICAL SPACE"),
        '\u{3000}' => Some("IDEOGRAPHIC SPACE"),

        // Special characters
        '\u{034F}' => Some("COMBINING GRAPHEME JOINER"),
        '\u{061C}' => Some("ARABIC LETTER MARK"),
        '\u{115F}' => Some("HANGUL CHOSEONG FILLER"),
        '\u{1160}' => Some("HANGUL JUNGSEONG FILLER"),
        '\u{17B4}' => Some("KHMER VOWEL INHERENT AQ"),
        '\u{17B5}' => Some("KHMER VOWEL INHERENT AA"),
        '\u{180E}' => Some("MONGOLIAN VOWEL SEPARATOR"),

        // BOM and special
        '\u{FEFF}' => Some("BYTE ORDER MARK"),
        '\u{FFFE}' => Some("NONCHARACTER"),
        '\u{FFFF}' => Some("NONCHARACTER"),

        // Interlinear annotation
        '\u{FFF9}' => Some("INTERLINEAR ANNOTATION ANCHOR"),
        '\u{FFFA}' => Some("INTERLINEAR ANNOTATION SEPARATOR"),
        '\u{FFFB}' => Some("INTERLINEAR ANNOTATION TERMINATOR"),

        // Tag characters (U+E0000 - U+E007F)
        '\u{E0001}' => Some("LANGUAGE TAG"),
        '\u{E0020}' => Some("TAG SPACE"),

        _ => None,
    }
}