// logicaffeine_language/lexer.rs

//! Two-stage lexer for LOGOS natural language input.
//!
//! The lexer transforms natural language text into a token stream suitable
//! for parsing. It operates in two stages:
//!
//! ## Stage 1: Line Lexer
//!
//! The [`LineLexer`] handles structural concerns:
//!
//! - **Indentation**: Tracks indent levels, emits `Indent`/`Dedent` tokens
//! - **Block boundaries**: Identifies significant whitespace
//! - **Content extraction**: Passes line content to Stage 2
//!
//! ## Stage 2: Word Lexer
//!
//! The [`Lexer`] performs word-level tokenization:
//!
//! - **Vocabulary lookup**: Identifies words via the lexicon database
//! - **Morphological analysis**: Handles inflection (verb tenses, plurals)
//! - **Ambiguity resolution**: Uses priority rules for ambiguous words
//!
//! ## Ambiguity Rules
//!
//! When a word matches multiple lexicon entries, priority determines the token:
//!
//! 1. **Quantifiers** over nouns ("some" → Quantifier, not Noun)
//! 2. **Determiners** over adjectives ("the" → Determiner, not Adjective)
//! 3. **Verbs** over nouns for -ing/-ed forms ("running" → Verb)
//!
//! ## Example
//!
//! ```text
//! Input:  "Every cat sleeps."
//! Output: [Quantifier("every"), Noun("cat"), Verb("sleeps"), Period]
//! ```

use logicaffeine_base::Interner;
use crate::lexicon::{self, Aspect, Definiteness, Lexicon, Time};
use crate::token::{BlockType, CalendarUnit, FocusKind, MeasureKind, Span, Token, TokenType};

// ============================================================================
// Stage 1: Line Lexer (Spec §2.5.2)
// ============================================================================
/// Tokens emitted by the [`LineLexer`] (Stage 1).
///
/// Structural tokens (`Indent`, `Dedent`, `Newline`) describe block shape;
/// all remaining text passes through opaquely as [`LineToken::Content`]
/// for Stage 2 word classification.
#[derive(Debug, Clone, PartialEq)]
pub enum LineToken {
    /// Block increased indentation relative to the enclosing level.
    Indent,
    /// Block decreased indentation; one `Dedent` is emitted per closed level.
    Dedent,
    /// Logical newline (statement boundary) - reserved for future use.
    Newline,
    /// Line content to be further tokenized by Stage 2.
    /// `start`/`end` are byte offsets into the original source; `text` is
    /// the line with trailing whitespace trimmed.
    Content { text: String, start: usize, end: usize },
}

/// Stage 1 Lexer: Handles only lines, indentation, and structural tokens.
/// Treats all other text as opaque `Content` for the Stage 2 WordLexer.
pub struct LineLexer<'a> {
    /// Original source text, used for slicing line content.
    source: &'a str,
    /// Byte view of `source` for cheap ASCII scanning.
    bytes: &'a [u8],
    /// Stack of active indentation widths; the bottom entry is always 0.
    indent_stack: Vec<usize>,
    /// Number of `Dedent` tokens still owed to the caller.
    pending_dedents: usize,
    /// Current scan position as a byte offset into `source`.
    position: usize,
    /// True if we need to emit Content for current line
    has_pending_content: bool,
    // Byte span and trimmed text of the queued Content token.
    pending_content_start: usize,
    pending_content_end: usize,
    pending_content_text: String,
    /// True after we've finished processing all lines
    finished_lines: bool,
    /// True if we've emitted at least one Indent (need to emit Dedents at EOF)
    emitted_indent: bool,
    /// Escape block body byte ranges to skip (start_byte, end_byte)
    escape_body_ranges: Vec<(usize, usize)>,
}

81impl<'a> LineLexer<'a> {
82    pub fn new(source: &'a str) -> Self {
83        Self {
84            source,
85            bytes: source.as_bytes(),
86            indent_stack: vec![0],
87            pending_dedents: 0,
88            position: 0,
89            has_pending_content: false,
90            pending_content_start: 0,
91            pending_content_end: 0,
92            pending_content_text: String::new(),
93            finished_lines: false,
94            emitted_indent: false,
95            escape_body_ranges: Vec::new(),
96        }
97    }
98
99    pub fn with_escape_ranges(source: &'a str, escape_body_ranges: Vec<(usize, usize)>) -> Self {
100        Self {
101            source,
102            bytes: source.as_bytes(),
103            indent_stack: vec![0],
104            pending_dedents: 0,
105            position: 0,
106            has_pending_content: false,
107            pending_content_start: 0,
108            pending_content_end: 0,
109            pending_content_text: String::new(),
110            finished_lines: false,
111            emitted_indent: false,
112            escape_body_ranges,
113        }
114    }
115
116    /// Check if a byte position falls within an escape body range.
117    fn is_in_escape_body(&self, pos: usize) -> bool {
118        self.escape_body_ranges.iter().any(|(start, end)| pos >= *start && pos < *end)
119    }
120
121    /// Calculate indentation level at current position (at start of line).
122    /// Returns (indent_level, content_start_pos).
123    fn measure_indent(&self, line_start: usize) -> (usize, usize) {
124        let mut indent = 0;
125        let mut pos = line_start;
126
127        while pos < self.bytes.len() {
128            match self.bytes[pos] {
129                b' ' => {
130                    indent += 1;
131                    pos += 1;
132                }
133                b'\t' => {
134                    indent += 4; // Tab = 4 spaces
135                    pos += 1;
136                }
137                _ => break,
138            }
139        }
140
141        (indent, pos)
142    }
143
144    /// Read content from current position until end of line or EOF.
145    /// Returns (content_text, content_start, content_end, next_line_start).
146    fn read_line_content(&self, content_start: usize) -> (String, usize, usize, usize) {
147        let mut pos = content_start;
148
149        // Find end of line
150        while pos < self.bytes.len() && self.bytes[pos] != b'\n' {
151            pos += 1;
152        }
153
154        let content_end = pos;
155        let text = self.source[content_start..content_end].trim_end().to_string();
156
157        // Move past newline if present
158        let next_line_start = if pos < self.bytes.len() && self.bytes[pos] == b'\n' {
159            pos + 1
160        } else {
161            pos
162        };
163
164        (text, content_start, content_end, next_line_start)
165    }
166
167    /// Check if the line starting at `pos` is blank (only whitespace).
168    fn is_blank_line(&self, line_start: usize) -> bool {
169        let mut pos = line_start;
170        while pos < self.bytes.len() {
171            match self.bytes[pos] {
172                b' ' | b'\t' => pos += 1,
173                b'\n' => return true,
174                _ => return false,
175            }
176        }
177        true // EOF counts as blank
178    }
179
180    /// Process the next line and update internal state.
181    /// Returns true if we have tokens to emit, false if we're done.
182    fn process_next_line(&mut self) -> bool {
183        // Skip blank lines
184        while self.position < self.bytes.len() && self.is_blank_line(self.position) {
185            // Skip to next line
186            while self.position < self.bytes.len() && self.bytes[self.position] != b'\n' {
187                self.position += 1;
188            }
189            if self.position < self.bytes.len() {
190                self.position += 1; // Skip the newline
191            }
192        }
193
194        // Check if we've reached EOF
195        if self.position >= self.bytes.len() {
196            self.finished_lines = true;
197            // Emit remaining dedents at EOF
198            if self.indent_stack.len() > 1 {
199                self.pending_dedents = self.indent_stack.len() - 1;
200                self.indent_stack.truncate(1);
201            }
202            return self.pending_dedents > 0;
203        }
204
205        // Measure indentation of current line
206        let (line_indent, content_start) = self.measure_indent(self.position);
207
208        // Read line content
209        let (text, start, end, next_pos) = self.read_line_content(content_start);
210
211        // Skip if content is empty (shouldn't happen after blank line skip, but be safe)
212        if text.is_empty() {
213            self.position = next_pos;
214            return self.process_next_line();
215        }
216
217        let current_indent = *self.indent_stack.last().unwrap();
218
219        // Handle indentation changes
220        if line_indent > current_indent {
221            // Indent: push new level
222            self.indent_stack.push(line_indent);
223            self.emitted_indent = true;
224            // Store content to emit after Indent
225            self.has_pending_content = true;
226            self.pending_content_text = text;
227            self.pending_content_start = start;
228            self.pending_content_end = end;
229            self.position = next_pos;
230            // We'll emit Indent first, then Content
231            return true;
232        } else if line_indent < current_indent {
233            // Dedent: pop until we match
234            while self.indent_stack.len() > 1 {
235                let top = *self.indent_stack.last().unwrap();
236                if line_indent < top {
237                    self.indent_stack.pop();
238                    self.pending_dedents += 1;
239                } else {
240                    break;
241                }
242            }
243            // Store content to emit after Dedents
244            self.has_pending_content = true;
245            self.pending_content_text = text;
246            self.pending_content_start = start;
247            self.pending_content_end = end;
248            self.position = next_pos;
249            return true;
250        } else {
251            // Same indentation level
252            self.has_pending_content = true;
253            self.pending_content_text = text;
254            self.pending_content_start = start;
255            self.pending_content_end = end;
256            self.position = next_pos;
257            return true;
258        }
259    }
260}
261
impl<'a> Iterator for LineLexer<'a> {
    type Item = LineToken;

    /// Emits tokens in priority order: owed `Dedent`s first, then any queued
    /// `Content`, then the structural result of scanning the next line.
    fn next(&mut self) -> Option<LineToken> {
        // 1. Emit pending dedents first
        if self.pending_dedents > 0 {
            self.pending_dedents -= 1;
            return Some(LineToken::Dedent);
        }

        // 2. Emit content queued by a previous call (e.g. right after an
        //    Indent/Dedent was returned for the same line).
        if self.has_pending_content {
            self.has_pending_content = false;
            let text = std::mem::take(&mut self.pending_content_text);
            let start = self.pending_content_start;
            let end = self.pending_content_end;
            return Some(LineToken::Content { text, start, end });
        }

        // 3. Scan the next line. An Indent is detected by comparing the
        //    indent-stack depth before and after processing.
        if !self.finished_lines {
            let had_indent = self.indent_stack.len();
            if self.process_next_line() {
                // A deeper stack means the line opened a block: emit Indent
                // now; its Content stays queued for the next call.
                if self.indent_stack.len() > had_indent {
                    return Some(LineToken::Indent);
                }
                // A shallower line owed us dedents: emit the first one.
                if self.pending_dedents > 0 {
                    self.pending_dedents -= 1;
                    return Some(LineToken::Dedent);
                }
                // Same indentation level: emit the content directly.
                if self.has_pending_content {
                    self.has_pending_content = false;
                    let text = std::mem::take(&mut self.pending_content_text);
                    let start = self.pending_content_start;
                    let end = self.pending_content_end;
                    return Some(LineToken::Content { text, start, end });
                }
            } else if self.pending_dedents > 0 {
                // EOF was reached with open blocks: start closing them.
                self.pending_dedents -= 1;
                return Some(LineToken::Dedent);
            }
        }

        // 4. Emit any remaining dedents at EOF
        if self.pending_dedents > 0 {
            self.pending_dedents -= 1;
            return Some(LineToken::Dedent);
        }

        None
    }
}

// ============================================================================
// Stage 2: Word Lexer (existing Lexer)
// ============================================================================

/// Tokenization mode for the Stage 2 word lexer; selects which
/// vocabulary interpretation rules apply.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum LexerMode {
    #[default]
    Declarative, // Logic, Theorems, Definitions
    Imperative,  // Main, Functions, Code
}

/// Stage 2 word-level lexer: classifies words against the lexicon and
/// produces the final token stream.
pub struct Lexer<'a> {
    /// Pre-split words with their source spans and trailing punctuation.
    words: Vec<WordItem>,
    /// Index of the next word to classify.
    pos: usize,
    /// Vocabulary database used for word lookup.
    lexicon: Lexicon,
    /// Shared string interner for symbol handling.
    interner: &'a mut Interner,
    /// Total byte length of the original input.
    input_len: usize,
    /// True while lexing inside a "Let ..." construct.
    in_let_context: bool,
    /// Current tokenization mode (declarative vs. imperative).
    mode: LexerMode,
    /// Owned copy of the input text.
    source: String,
    /// Escape block body byte ranges: (skip_start, skip_end) for filtering LineLexer events
    escape_body_ranges: Vec<(usize, usize)>,
}

/// A single pre-split word plus its location in the source text.
struct WordItem {
    /// The word text (or a `\x00`-prefixed marker for literals/escapes).
    word: String,
    /// Sentence punctuation immediately following the word, if any.
    trailing_punct: Option<char>,
    /// Byte offset where the word starts in the source.
    start: usize,
    /// Byte offset just past the word's end.
    end: usize,
    /// Byte offset of the trailing punctuation character, if any.
    punct_pos: Option<usize>,
}

355impl<'a> Lexer<'a> {
356    /// Creates a new lexer for the given input text.
357    ///
358    /// The lexer will tokenize natural language text according to the
359    /// lexicon database, performing morphological analysis and ambiguity
360    /// resolution.
361    ///
362    /// # Arguments
363    ///
364    /// * `input` - The natural language text to tokenize
365    /// * `interner` - String interner for efficient symbol handling
366    ///
367    /// # Example
368    ///
369    /// ```
370    /// use logicaffeine_language::lexer::Lexer;
371    /// use logicaffeine_base::Interner;
372    ///
373    /// let mut interner = Interner::new();
374    /// let mut lexer = Lexer::new("Every cat sleeps.", &mut interner);
375    /// let tokens = lexer.tokenize();
376    ///
377    /// assert_eq!(tokens.len(), 5); // Quantifier, Noun, Verb, Period, EOI
378    /// ```
379    pub fn new(input: &str, interner: &'a mut Interner) -> Self {
380        let escape_ranges = Self::find_escape_block_ranges(input);
381        let escape_body_ranges: Vec<(usize, usize)> = escape_ranges.iter()
382            .map(|(_, end, content_start, _)| (*content_start, *end))
383            .collect();
384        let words = Self::split_into_words(input, &escape_ranges);
385        let input_len = input.len();
386
387        Lexer {
388            words,
389            pos: 0,
390            lexicon: Lexicon::new(),
391            interner,
392            input_len,
393            in_let_context: false,
394            mode: LexerMode::Declarative,
395            source: input.to_string(),
396            escape_body_ranges,
397        }
398    }
399
    /// Pre-scan source text for escape block bodies.
    /// Returns (skip_start_byte, skip_end_byte, content_start_byte, raw_code) tuples.
    /// `skip_start` is the line start (for byte skipping in split_into_words).
    /// `content_start` is after leading whitespace (for token span alignment with Indent events).
    fn find_escape_block_ranges(source: &str) -> Vec<(usize, usize, usize, String)> {
        let mut ranges = Vec::new();
        // Precompute the byte offset of every line start so line indices can
        // be mapped back to source byte positions.
        let lines: Vec<&str> = source.split('\n').collect();
        let mut line_starts: Vec<usize> = Vec::with_capacity(lines.len());
        let mut pos = 0;
        for line in &lines {
            line_starts.push(pos);
            pos += line.len() + 1; // +1 for the newline
        }

        let mut i = 0;
        while i < lines.len() {
            let trimmed = lines[i].trim();
            // Check if this line contains an escape header: "Escape to Rust:"
            // Matches both statement position (whole line) and expression position
            // (e.g., "Let x: Int be Escape to Rust:")
            let lower = trimmed.to_lowercase();
            if lower == "escape to rust:" ||
               lower.ends_with(" escape to rust:") ||
               (lower.starts_with("escape to ") && lower.ends_with(':'))
            {
                // Find the body: subsequent lines with deeper indentation
                let header_indent = Self::measure_indent_static(lines[i]);
                i += 1;

                // Skip blank lines to find the first body line
                let mut body_start_line = i;
                while body_start_line < lines.len() && lines[body_start_line].trim().is_empty() {
                    body_start_line += 1;
                }

                if body_start_line >= lines.len() {
                    // No body found; resume the outer scan at the line after
                    // the header (blank lines will be skipped harmlessly).
                    continue;
                }

                let base_indent = Self::measure_indent_static(lines[body_start_line]);
                if base_indent <= header_indent {
                    // No indented body
                    continue;
                }

                // Capture all lines at base_indent or deeper
                let body_byte_start = line_starts[body_start_line];
                let mut body_end_line = body_start_line;
                let mut code_lines: Vec<String> = Vec::new();

                let mut j = body_start_line;
                while j < lines.len() {
                    let line = lines[j];
                    if line.trim().is_empty() {
                        // Blank lines are preserved inside the body
                        code_lines.push(String::new());
                        body_end_line = j;
                        j += 1;
                        continue;
                    }
                    let line_indent = Self::measure_indent_static(line);
                    if line_indent < base_indent {
                        // A shallower line ends the body.
                        break;
                    }
                    // Strip base indentation so the raw code is left-aligned.
                    let stripped = Self::strip_indent(line, base_indent);
                    code_lines.push(stripped);
                    body_end_line = j;
                    j += 1;
                }

                // Trim trailing empty lines from code
                while code_lines.last().map_or(false, |l| l.is_empty()) {
                    code_lines.pop();
                }

                if !code_lines.is_empty() {
                    // End of the skip range is the start of the line after the
                    // body, or EOF if the body runs to the end of the source.
                    let body_byte_end = if body_end_line + 1 < lines.len() {
                        line_starts[body_end_line + 1]
                    } else {
                        source.len()
                    };
                    // Compute content start (after leading whitespace of first body line)
                    let content_start = body_byte_start + Self::leading_whitespace_bytes(lines[body_start_line]);
                    let raw_code = code_lines.join("\n");
                    ranges.push((body_byte_start, body_byte_end, content_start, raw_code));
                }

                // Resume the outer scan at the first line after the body.
                i = j;
            } else {
                i += 1;
            }
        }

        ranges
    }

498    /// Count leading whitespace bytes in a line.
499    fn leading_whitespace_bytes(line: &str) -> usize {
500        let mut count = 0;
501        for c in line.chars() {
502            match c {
503                ' ' | '\t' => count += c.len_utf8(),
504                _ => break,
505            }
506        }
507        count
508    }
509
510    /// Measure indent of a line (static helper for pre-scan).
511    fn measure_indent_static(line: &str) -> usize {
512        let mut indent = 0;
513        for c in line.chars() {
514            match c {
515                ' ' => indent += 1,
516                '\t' => indent += 4,
517                _ => break,
518            }
519        }
520        indent
521    }
522
523    /// Strip `count` leading spaces/tabs from a line.
524    fn strip_indent(line: &str, count: usize) -> String {
525        let mut stripped = 0;
526        let mut byte_pos = 0;
527        for (i, c) in line.char_indices() {
528            if stripped >= count {
529                byte_pos = i;
530                break;
531            }
532            match c {
533                ' ' => { stripped += 1; byte_pos = i + 1; }
534                '\t' => { stripped += 4; byte_pos = i + 1; }
535                _ => { byte_pos = i; break; }
536            }
537        }
538        if stripped < count {
539            byte_pos = line.len();
540        }
541        line[byte_pos..].to_string()
542    }
543
544    fn split_into_words(input: &str, escape_ranges: &[(usize, usize, usize, String)]) -> Vec<WordItem> {
545        let mut items = Vec::new();
546        let mut current_word = String::new();
547        let mut word_start = 0;
548        let chars: Vec<char> = input.chars().collect();
549        let mut char_idx = 0;
550        let mut skip_count = 0;
551        // Track byte offset for escape range matching
552        let mut skip_to_byte: Option<usize> = None;
553
554        for (i, c) in input.char_indices() {
555            if skip_count > 0 {
556                skip_count -= 1;
557                char_idx += 1;
558                continue;
559            }
560            // Skip bytes inside escape block bodies
561            if let Some(end) = skip_to_byte {
562                if i < end {
563                    char_idx += 1;
564                    continue;
565                }
566                skip_to_byte = None;
567                word_start = i;
568            }
569            // Check if this byte position starts an escape block body
570            if let Some((_, end, content_start, raw_code)) = escape_ranges.iter().find(|(s, _, _, _)| i == *s) {
571                // Flush any pending word
572                if !current_word.is_empty() {
573                    items.push(WordItem {
574                        word: std::mem::take(&mut current_word),
575                        trailing_punct: None,
576                        start: word_start,
577                        end: i,
578                        punct_pos: None,
579                    });
580                }
581                // Emit the entire block as a single \x00ESC: marker
582                // Use content_start (after whitespace) for span alignment with Indent events
583                items.push(WordItem {
584                    word: format!("\x00ESC:{}", raw_code),
585                    trailing_punct: None,
586                    start: *content_start,
587                    end: *end,
588                    punct_pos: None,
589                });
590                skip_to_byte = Some(*end);
591                word_start = *end;
592                char_idx += 1;
593                continue;
594            }
595            let next_pos = i + c.len_utf8();
596            match c {
597                ' ' | '\t' | '\n' | '\r' => {
598                    if !current_word.is_empty() {
599                        items.push(WordItem {
600                            word: std::mem::take(&mut current_word),
601                            trailing_punct: None,
602                            start: word_start,
603                            end: i,
604                            punct_pos: None,
605                        });
606                    }
607                    word_start = next_pos;
608                }
609                '.' => {
610                    // Check if this is a decimal point (digit before and after)
611                    let prev_is_digit = !current_word.is_empty()
612                        && current_word.chars().last().map_or(false, |ch| ch.is_ascii_digit());
613                    let next_is_digit = char_idx + 1 < chars.len()
614                        && chars[char_idx + 1].is_ascii_digit();
615
616                    if prev_is_digit && next_is_digit {
617                        // This is a decimal point, include it in the current word
618                        current_word.push(c);
619                    } else {
620                        // This is a sentence period
621                        if !current_word.is_empty() {
622                            items.push(WordItem {
623                                word: std::mem::take(&mut current_word),
624                                trailing_punct: Some(c),
625                                start: word_start,
626                                end: i,
627                                punct_pos: Some(i),
628                            });
629                        } else {
630                            items.push(WordItem {
631                                word: String::new(),
632                                trailing_punct: Some(c),
633                                start: i,
634                                end: next_pos,
635                                punct_pos: Some(i),
636                            });
637                        }
638                        word_start = next_pos;
639                    }
640                }
641                '#' => {
642                    // Check for ## block header (markdown-style)
643                    if char_idx + 1 < chars.len() && chars[char_idx + 1] == '#' {
644                        // This is a ## block header
645                        // Skip the second # and capture the next word as a block header
646                        if !current_word.is_empty() {
647                            items.push(WordItem {
648                                word: std::mem::take(&mut current_word),
649                                trailing_punct: None,
650                                start: word_start,
651                                end: i,
652                                punct_pos: None,
653                            });
654                        }
655                        // Skip whitespace after ##
656                        let header_start = i;
657                        let mut j = char_idx + 2;
658                        while j < chars.len() && (chars[j] == ' ' || chars[j] == '\t') {
659                            j += 1;
660                        }
661                        // Capture the block type word
662                        let mut block_word = String::from("##");
663                        while j < chars.len() && chars[j].is_alphabetic() {
664                            block_word.push(chars[j]);
665                            j += 1;
666                        }
667                        if block_word.len() > 2 {
668                            items.push(WordItem {
669                                word: block_word,
670                                trailing_punct: None,
671                                start: header_start,
672                                end: header_start + (j - char_idx),
673                                punct_pos: None,
674                            });
675                        }
676                        skip_count = j - char_idx - 1;
677                        word_start = header_start + (j - char_idx);
678                    } else {
679                        // Single # - treat as comment, skip to end of line
680                        // Count how many chars to skip (without modifying char_idx here -
681                        // the main loop's skip handler will increment it)
682                        let mut look_ahead = char_idx + 1;
683                        while look_ahead < chars.len() && chars[look_ahead] != '\n' {
684                            skip_count += 1;
685                            look_ahead += 1;
686                        }
687                        if !current_word.is_empty() {
688                            items.push(WordItem {
689                                word: std::mem::take(&mut current_word),
690                                trailing_punct: None,
691                                start: word_start,
692                                end: i,
693                                punct_pos: None,
694                            });
695                        }
696                        word_start = look_ahead + 1; // Start after the newline
697                    }
698                }
699                // String literals: "hello world"
700                '"' => {
701                    // Push any pending word
702                    if !current_word.is_empty() {
703                        items.push(WordItem {
704                            word: std::mem::take(&mut current_word),
705                            trailing_punct: None,
706                            start: word_start,
707                            end: i,
708                            punct_pos: None,
709                        });
710                    }
711
712                    // Scan until closing quote
713                    let string_start = i;
714                    let mut j = char_idx + 1;
715                    let mut string_content = String::new();
716                    while j < chars.len() && chars[j] != '"' {
717                        if chars[j] == '\\' && j + 1 < chars.len() {
718                            // Escape sequence - skip backslash, include next char
719                            j += 1;
720                            if j < chars.len() {
721                                string_content.push(chars[j]);
722                            }
723                        } else {
724                            string_content.push(chars[j]);
725                        }
726                        j += 1;
727                    }
728
729                    // Create a special marker for string literals
730                    // We prefix with a special character to identify in tokenize()
731                    items.push(WordItem {
732                        word: format!("\x00STR:{}", string_content),
733                        trailing_punct: None,
734                        start: string_start,
735                        end: if j < chars.len() { j + 1 } else { j },
736                        punct_pos: None,
737                    });
738
739                    // Skip past the closing quote
740                    if j < chars.len() {
741                        skip_count = j - char_idx;
742                    } else {
743                        skip_count = j - char_idx - 1;
744                    }
745                    word_start = if j < chars.len() { j + 1 } else { j };
746                }
747                // Character literals with backticks: `x`
748                '`' => {
749                    // Push any pending word
750                    if !current_word.is_empty() {
751                        items.push(WordItem {
752                            word: std::mem::take(&mut current_word),
753                            trailing_punct: None,
754                            start: word_start,
755                            end: i,
756                            punct_pos: None,
757                        });
758                    }
759
760                    // Scan for character content and closing backtick
761                    let char_start = i;
762                    let mut j = char_idx + 1;
763                    let mut char_content = String::new();
764
765                    if j < chars.len() {
766                        if chars[j] == '\\' && j + 1 < chars.len() {
767                            // Escape sequence
768                            j += 1;
769                            let escaped_char = match chars[j] {
770                                'n' => '\n',
771                                't' => '\t',
772                                'r' => '\r',
773                                '\\' => '\\',
774                                '`' => '`',
775                                '0' => '\0',
776                                c => c,
777                            };
778                            char_content.push(escaped_char);
779                            j += 1;
780                        } else if chars[j] != '`' {
781                            // Regular character
782                            char_content.push(chars[j]);
783                            j += 1;
784                        }
785                    }
786
787                    // Expect closing backtick
788                    if j < chars.len() && chars[j] == '`' {
789                        j += 1; // skip closing backtick
790                    }
791
792                    // Create a special marker for char literals
793                    items.push(WordItem {
794                        word: format!("\x00CHAR:{}", char_content),
795                        trailing_punct: None,
796                        start: char_start,
797                        end: if j <= chars.len() { char_start + (j - char_idx) } else { char_start + 1 },
798                        punct_pos: None,
799                    });
800
801                    if j > char_idx + 1 {
802                        skip_count = j - char_idx - 1;
803                    }
804                    word_start = char_start + (j - char_idx);
805                }
806                // Handle -> as a single token for return type syntax
807                '-' if char_idx + 1 < chars.len() && chars[char_idx + 1] == '>' => {
808                    // Push any pending word first
809                    if !current_word.is_empty() {
810                        items.push(WordItem {
811                            word: std::mem::take(&mut current_word),
812                            trailing_punct: None,
813                            start: word_start,
814                            end: i,
815                            punct_pos: None,
816                        });
817                    }
818                    // Push -> as its own word
819                    items.push(WordItem {
820                        word: "->".to_string(),
821                        trailing_punct: None,
822                        start: i,
823                        end: i + 2,
824                        punct_pos: None,
825                    });
826                    skip_count = 1; // Skip the '>' character
827                    word_start = i + 2;
828                }
829                // Grand Challenge: Handle <= as a single token
830                '<' if char_idx + 1 < chars.len() && chars[char_idx + 1] == '=' => {
831                    if !current_word.is_empty() {
832                        items.push(WordItem {
833                            word: std::mem::take(&mut current_word),
834                            trailing_punct: None,
835                            start: word_start,
836                            end: i,
837                            punct_pos: None,
838                        });
839                    }
840                    items.push(WordItem {
841                        word: "<=".to_string(),
842                        trailing_punct: None,
843                        start: i,
844                        end: i + 2,
845                        punct_pos: None,
846                    });
847                    skip_count = 1;
848                    word_start = i + 2;
849                }
850                // Grand Challenge: Handle >= as a single token
851                '>' if char_idx + 1 < chars.len() && chars[char_idx + 1] == '=' => {
852                    if !current_word.is_empty() {
853                        items.push(WordItem {
854                            word: std::mem::take(&mut current_word),
855                            trailing_punct: None,
856                            start: word_start,
857                            end: i,
858                            punct_pos: None,
859                        });
860                    }
861                    items.push(WordItem {
862                        word: ">=".to_string(),
863                        trailing_punct: None,
864                        start: i,
865                        end: i + 2,
866                        punct_pos: None,
867                    });
868                    skip_count = 1;
869                    word_start = i + 2;
870                }
871                // Handle == as a single token
872                '=' if char_idx + 1 < chars.len() && chars[char_idx + 1] == '=' => {
873                    if !current_word.is_empty() {
874                        items.push(WordItem {
875                            word: std::mem::take(&mut current_word),
876                            trailing_punct: None,
877                            start: word_start,
878                            end: i,
879                            punct_pos: None,
880                        });
881                    }
882                    items.push(WordItem {
883                        word: "==".to_string(),
884                        trailing_punct: None,
885                        start: i,
886                        end: i + 2,
887                        punct_pos: None,
888                    });
889                    skip_count = 1;
890                    word_start = i + 2;
891                }
892                // Handle != as a single token
893                '!' if char_idx + 1 < chars.len() && chars[char_idx + 1] == '=' => {
894                    if !current_word.is_empty() {
895                        items.push(WordItem {
896                            word: std::mem::take(&mut current_word),
897                            trailing_punct: None,
898                            start: word_start,
899                            end: i,
900                            punct_pos: None,
901                        });
902                    }
903                    items.push(WordItem {
904                        word: "!=".to_string(),
905                        trailing_punct: None,
906                        start: i,
907                        end: i + 2,
908                        punct_pos: None,
909                    });
910                    skip_count = 1;
911                    word_start = i + 2;
912                }
913                // Special handling for '-' in ISO-8601 dates (YYYY-MM-DD)
914                '-' if Self::is_date_hyphen(&current_word, &chars, char_idx) => {
915                    // This hyphen is part of a date, include it in the word
916                    current_word.push(c);
917                }
918                // Special handling for ':' in time literals (9:30am, 11:45pm)
919                ':' if Self::is_time_colon(&current_word, &chars, char_idx) => {
920                    // This colon is part of a time, include it in the word
921                    current_word.push(c);
922                }
923                '(' | ')' | '[' | ']' | ',' | '?' | '!' | ':' | '+' | '-' | '*' | '/' | '%' | '<' | '>' | '=' => {
924                    if !current_word.is_empty() {
925                        items.push(WordItem {
926                            word: std::mem::take(&mut current_word),
927                            trailing_punct: Some(c),
928                            start: word_start,
929                            end: i,
930                            punct_pos: Some(i),
931                        });
932                    } else {
933                        items.push(WordItem {
934                            word: String::new(),
935                            trailing_punct: Some(c),
936                            start: i,
937                            end: next_pos,
938                            punct_pos: Some(i),
939                        });
940                    }
941                    word_start = next_pos;
942                }
943                '\'' => {
944                    // Handle contractions: expand "don't" → "do" + "not", etc.
945                    let remaining: String = chars[char_idx + 1..].iter().collect();
946                    let remaining_lower = remaining.to_lowercase();
947
948                    if remaining_lower.starts_with("t ") || remaining_lower.starts_with("t.") ||
949                       remaining_lower.starts_with("t,") || remaining_lower == "t" ||
950                       (char_idx + 1 < chars.len() && chars[char_idx + 1] == 't' &&
951                        (char_idx + 2 >= chars.len() || !chars[char_idx + 2].is_alphabetic())) {
952                        // This is a contraction ending in 't (don't, doesn't, won't, can't, etc.)
953                        let word_lower = current_word.to_lowercase();
954                        if word_lower == "don" || word_lower == "doesn" || word_lower == "didn" {
955                            // do/does/did + not
956                            let base = if word_lower == "don" { "do" }
957                                      else if word_lower == "doesn" { "does" }
958                                      else { "did" };
959                            items.push(WordItem {
960                                word: base.to_string(),
961                                trailing_punct: None,
962                                start: word_start,
963                                end: i,
964                                punct_pos: None,
965                            });
966                            items.push(WordItem {
967                                word: "not".to_string(),
968                                trailing_punct: None,
969                                start: i,
970                                end: i + 2,
971                                punct_pos: None,
972                            });
973                            current_word.clear();
974                            word_start = next_pos + 1;
975                            skip_count = 1;
976                        } else if word_lower == "won" {
977                            // will + not
978                            items.push(WordItem {
979                                word: "will".to_string(),
980                                trailing_punct: None,
981                                start: word_start,
982                                end: i,
983                                punct_pos: None,
984                            });
985                            items.push(WordItem {
986                                word: "not".to_string(),
987                                trailing_punct: None,
988                                start: i,
989                                end: i + 2,
990                                punct_pos: None,
991                            });
992                            current_word.clear();
993                            word_start = next_pos + 1;
994                            skip_count = 1;
995                        } else if word_lower == "can" {
996                            // cannot
997                            items.push(WordItem {
998                                word: "cannot".to_string(),
999                                trailing_punct: None,
1000                                start: word_start,
1001                                end: i + 2,
1002                                punct_pos: None,
1003                            });
1004                            current_word.clear();
1005                            word_start = next_pos + 1;
1006                            skip_count = 1;
1007                        } else {
1008                            // Unknown contraction, split normally
1009                            if !current_word.is_empty() {
1010                                items.push(WordItem {
1011                                    word: std::mem::take(&mut current_word),
1012                                    trailing_punct: Some('\''),
1013                                    start: word_start,
1014                                    end: i,
1015                                    punct_pos: Some(i),
1016                                });
1017                            }
1018                            word_start = next_pos;
1019                        }
1020                    } else {
1021                        // Not a 't contraction, handle normally
1022                        if !current_word.is_empty() {
1023                            items.push(WordItem {
1024                                word: std::mem::take(&mut current_word),
1025                                trailing_punct: Some('\''),
1026                                start: word_start,
1027                                end: i,
1028                                punct_pos: Some(i),
1029                            });
1030                        }
1031                        word_start = next_pos;
1032                    }
1033                }
1034                c if c.is_alphabetic() || c.is_ascii_digit() || (c == '.' && !current_word.is_empty() && current_word.chars().all(|ch| ch.is_ascii_digit())) || c == '_' => {
1035                    if current_word.is_empty() {
1036                        word_start = i;
1037                    }
1038                    current_word.push(c);
1039                }
1040                _ => {
1041                    word_start = next_pos;
1042                }
1043            }
1044            char_idx += 1;
1045        }
1046
1047        if !current_word.is_empty() {
1048            items.push(WordItem {
1049                word: current_word,
1050                trailing_punct: None,
1051                start: word_start,
1052                end: input.len(),
1053                punct_pos: None,
1054            });
1055        }
1056
1057        items
1058    }
1059
1060    fn peek_word(&self, offset: usize) -> Option<&str> {
1061        self.words.get(self.pos + offset).map(|w| w.word.as_str())
1062    }
1063
1064    fn peek_sequence(&self, expected: &[&str]) -> bool {
1065        for (i, &exp) in expected.iter().enumerate() {
1066            match self.peek_word(i + 1) {
1067                Some(w) if w.to_lowercase() == exp => continue,
1068                _ => return false,
1069            }
1070        }
1071        true
1072    }
1073
    /// Advances the word cursor past `count` items without emitting tokens.
    fn consume_words(&mut self, count: usize) {
        self.pos += count;
    }
1077
1078    /// Tokenizes the input text and returns a vector of [`Token`]s.
1079    ///
1080    /// Each token includes its type, the interned lexeme, and the source
1081    /// span for error reporting. Words are classified according to the
1082    /// lexicon database with priority-based ambiguity resolution.
1083    ///
1084    /// # Returns
1085    ///
1086    /// A vector of tokens representing the input. The final token is
1087    /// typically `TokenType::Eof`.
1088    pub fn tokenize(&mut self) -> Vec<Token> {
1089        let mut tokens = Vec::new();
1090
1091        while self.pos < self.words.len() {
1092            let item = &self.words[self.pos];
1093            let word = item.word.clone();
1094            let trailing_punct = item.trailing_punct;
1095            let word_start = item.start;
1096            let word_end = item.end;
1097            let punct_pos = item.punct_pos;
1098
1099            if word.is_empty() {
1100                if let Some(punct) = trailing_punct {
1101                    let kind = match punct {
1102                        '(' => TokenType::LParen,
1103                        ')' => TokenType::RParen,
1104                        '[' => TokenType::LBracket,
1105                        ']' => TokenType::RBracket,
1106                        ',' => TokenType::Comma,
1107                        ':' => TokenType::Colon,
1108                        '.' | '?' => {
1109                            self.in_let_context = false;
1110                            TokenType::Period
1111                        }
1112                        '!' => TokenType::Exclamation,
1113                        '+' => TokenType::Plus,
1114                        '-' => TokenType::Minus,
1115                        '*' => TokenType::Star,
1116                        '/' => TokenType::Slash,
1117                        '%' => TokenType::Percent,
1118                        '<' => TokenType::Lt,
1119                        '>' => TokenType::Gt,
1120                        '=' => TokenType::Assign,
1121                        _ => {
1122                            self.pos += 1;
1123                            continue;
1124                        }
1125                    };
1126                    let lexeme = self.interner.intern(&punct.to_string());
1127                    let span = Span::new(word_start, word_end);
1128                    tokens.push(Token::new(kind, lexeme, span));
1129                }
1130                self.pos += 1;
1131                continue;
1132            }
1133
1134            // Check for string literal marker (pre-tokenized in Stage 1)
1135            if word.starts_with("\x00STR:") {
1136                let content = &word[5..]; // Skip the marker prefix
1137                let sym = self.interner.intern(content);
1138                let span = Span::new(word_start, word_end);
1139                tokens.push(Token::new(TokenType::StringLiteral(sym), sym, span));
1140                self.pos += 1;
1141                continue;
1142            }
1143
1144            // Check for character literal marker
1145            if word.starts_with("\x00CHAR:") {
1146                let content = &word[6..]; // Skip the marker prefix
1147                let sym = self.interner.intern(content);
1148                let span = Span::new(word_start, word_end);
1149                tokens.push(Token::new(TokenType::CharLiteral(sym), sym, span));
1150                self.pos += 1;
1151                continue;
1152            }
1153
1154            // Check for escape block marker (pre-captured raw foreign code)
1155            if word.starts_with("\x00ESC:") {
1156                let content = &word[5..]; // Skip the "\x00ESC:" prefix
1157                let sym = self.interner.intern(content);
1158                let span = Span::new(word_start, word_end);
1159                tokens.push(Token::new(TokenType::EscapeBlock(sym), sym, span));
1160                self.pos += 1;
1161                continue;
1162            }
1163
1164            let kind = self.classify_with_lookahead(&word);
1165            let lexeme = self.interner.intern(&word);
1166            let span = Span::new(word_start, word_end);
1167            tokens.push(Token::new(kind, lexeme, span));
1168
1169            if let Some(punct) = trailing_punct {
1170                if punct == '\'' {
1171                    if let Some(next_item) = self.words.get(self.pos + 1) {
1172                        if next_item.word.to_lowercase() == "s" {
1173                            let poss_lexeme = self.interner.intern("'s");
1174                            let poss_start = punct_pos.unwrap_or(word_end);
1175                            let poss_end = next_item.end;
1176                            tokens.push(Token::new(TokenType::Possessive, poss_lexeme, Span::new(poss_start, poss_end)));
1177                            self.pos += 1;
1178                            if let Some(s_punct) = next_item.trailing_punct {
1179                                let kind = match s_punct {
1180                                    '(' => TokenType::LParen,
1181                                    ')' => TokenType::RParen,
1182                                    '[' => TokenType::LBracket,
1183                                    ']' => TokenType::RBracket,
1184                                    ',' => TokenType::Comma,
1185                                    ':' => TokenType::Colon,
1186                                    '.' | '?' => TokenType::Period,
1187                                    '!' => TokenType::Exclamation,
1188                                    '+' => TokenType::Plus,
1189                                    '-' => TokenType::Minus,
1190                                    '*' => TokenType::Star,
1191                                    '/' => TokenType::Slash,
1192                                    '%' => TokenType::Percent,
1193                                    '<' => TokenType::Lt,
1194                                    '>' => TokenType::Gt,
1195                                    '=' => TokenType::Assign,
1196                                    _ => {
1197                                        self.pos += 1;
1198                                        continue;
1199                                    }
1200                                };
1201                                let s_punct_pos = next_item.punct_pos.unwrap_or(next_item.end);
1202                                let lexeme = self.interner.intern(&s_punct.to_string());
1203                                tokens.push(Token::new(kind, lexeme, Span::new(s_punct_pos, s_punct_pos + 1)));
1204                            }
1205                            self.pos += 1;
1206                            continue;
1207                        }
1208                    }
1209                    self.pos += 1;
1210                    continue;
1211                }
1212
1213                let kind = match punct {
1214                    '(' => TokenType::LParen,
1215                    ')' => TokenType::RParen,
1216                    '[' => TokenType::LBracket,
1217                    ']' => TokenType::RBracket,
1218                    ',' => TokenType::Comma,
1219                    ':' => TokenType::Colon,
1220                    '.' | '?' => {
1221                        self.in_let_context = false;
1222                        TokenType::Period
1223                    }
1224                    '!' => TokenType::Exclamation,
1225                    '+' => TokenType::Plus,
1226                    '-' => TokenType::Minus,
1227                    '*' => TokenType::Star,
1228                    '/' => TokenType::Slash,
1229                    '%' => TokenType::Percent,
1230                    '<' => TokenType::Lt,
1231                    '>' => TokenType::Gt,
1232                    '=' => TokenType::Assign,
1233                    _ => {
1234                        self.pos += 1;
1235                        continue;
1236                    }
1237                };
1238                let p_start = punct_pos.unwrap_or(word_end);
1239                let lexeme = self.interner.intern(&punct.to_string());
1240                tokens.push(Token::new(kind, lexeme, Span::new(p_start, p_start + 1)));
1241            }
1242
1243            self.pos += 1;
1244        }
1245
1246        let eof_lexeme = self.interner.intern("");
1247        let eof_span = Span::new(self.input_len, self.input_len);
1248        tokens.push(Token::new(TokenType::EOF, eof_lexeme, eof_span));
1249
1250        self.insert_indentation_tokens(tokens)
1251    }
1252
1253    /// Insert Indent/Dedent tokens using LineLexer's two-pass architecture (Spec §2.5.2).
1254    ///
1255    /// Phase 1: LineLexer determines the structural layout (where indents/dedents occur)
1256    /// Phase 2: We correlate these with word token positions
1257    fn insert_indentation_tokens(&mut self, tokens: Vec<Token>) -> Vec<Token> {
1258        let mut result = Vec::new();
1259        let empty_sym = self.interner.intern("");
1260
1261        // Phase 1: Run LineLexer to determine structural positions
1262        let line_lexer = LineLexer::new(&self.source);
1263        let line_tokens: Vec<LineToken> = line_lexer.collect();
1264
1265        // Build a list of (byte_position, is_indent) for structural tokens
1266        // Position is where the NEXT Content starts after the Indent/Dedent
1267        let mut structural_events: Vec<(usize, bool)> = Vec::new(); // (byte_pos, true=Indent, false=Dedent)
1268        let mut pending_indents = 0usize;
1269        let mut pending_dedents = 0usize;
1270
1271        for line_token in &line_tokens {
1272            match line_token {
1273                LineToken::Indent => {
1274                    pending_indents += 1;
1275                }
1276                LineToken::Dedent => {
1277                    pending_dedents += 1;
1278                }
1279                LineToken::Content { start, .. } => {
1280                    // Emit pending dedents first (they come BEFORE the content)
1281                    for _ in 0..pending_dedents {
1282                        structural_events.push((*start, false)); // false = Dedent
1283                    }
1284                    pending_dedents = 0;
1285
1286                    // Emit pending indents (they also come BEFORE the content)
1287                    for _ in 0..pending_indents {
1288                        structural_events.push((*start, true)); // true = Indent
1289                    }
1290                    pending_indents = 0;
1291                }
1292                LineToken::Newline => {}
1293            }
1294        }
1295
1296        // Handle any remaining dedents at EOF
1297        for _ in 0..pending_dedents {
1298            structural_events.push((self.input_len, false));
1299        }
1300
1301        // Filter out structural events from within escape block bodies.
1302        // The LineLexer sees raw Rust code lines and generates spurious Indent/Dedent
1303        // events for their indentation changes. We keep exactly the boundary events
1304        // (Indent at body start, Dedent at body end) but remove internal ones.
1305        if !self.escape_body_ranges.is_empty() {
1306            // For each escape body range, find the first Indent at the body start and
1307            // track that we're inside the range. Filter out all events strictly inside
1308            // the range except for the first Indent and events at/after the end.
1309            let mut filtered = Vec::new();
1310            for &(pos, is_indent) in &structural_events {
1311                let is_inside_escape_body = self.escape_body_ranges.iter().any(|(start, end)| {
1312                    // Strictly inside the body (not at start boundary and not at/after end)
1313                    pos > *start && pos < *end
1314                });
1315                if !is_inside_escape_body {
1316                    filtered.push((pos, is_indent));
1317                }
1318            }
1319            structural_events = filtered;
1320        }
1321
1322        // Sort events by position, with dedents before indents at same position
1323        structural_events.sort_by(|a, b| {
1324            if a.0 != b.0 {
1325                a.0.cmp(&b.0)
1326            } else {
1327                // Dedents (false) before Indents (true) at same position
1328                a.1.cmp(&b.1)
1329            }
1330        });
1331
1332        // Phase 2: Insert structural tokens at the right positions
1333        // Strategy: For each word token, check if any structural events should be inserted
1334        // before it (based on byte position)
1335
1336        let mut event_idx = 0;
1337        let mut last_colon_pos: Option<usize> = None;
1338
1339        for token in tokens.iter() {
1340            let token_start = token.span.start;
1341
1342            // Insert any structural tokens that should come BEFORE this token
1343            while event_idx < structural_events.len() {
1344                let (event_pos, is_indent) = structural_events[event_idx];
1345
1346                // Insert structural tokens before this token if the event position <= token start
1347                if event_pos <= token_start {
1348                    let span = if is_indent {
1349                        // Indent is inserted after the preceding Colon
1350                        Span::new(last_colon_pos.unwrap_or(event_pos), last_colon_pos.unwrap_or(event_pos))
1351                    } else {
1352                        Span::new(event_pos, event_pos)
1353                    };
1354                    let kind = if is_indent { TokenType::Indent } else { TokenType::Dedent };
1355                    result.push(Token::new(kind, empty_sym, span));
1356                    event_idx += 1;
1357                } else {
1358                    break;
1359                }
1360            }
1361
1362            result.push(token.clone());
1363
1364            // Track colon positions for Indent span calculation
1365            if token.kind == TokenType::Colon && self.is_end_of_line(token.span.end) {
1366                last_colon_pos = Some(token.span.end);
1367            }
1368        }
1369
1370        // Insert any remaining structural tokens (typically Dedents at EOF)
1371        while event_idx < structural_events.len() {
1372            let (event_pos, is_indent) = structural_events[event_idx];
1373            let span = Span::new(event_pos, event_pos);
1374            let kind = if is_indent { TokenType::Indent } else { TokenType::Dedent };
1375            result.push(Token::new(kind, empty_sym, span));
1376            event_idx += 1;
1377        }
1378
1379        // Ensure EOF is at the end
1380        let eof_pos = result.iter().position(|t| t.kind == TokenType::EOF);
1381        if let Some(pos) = eof_pos {
1382            let eof = result.remove(pos);
1383            result.push(eof);
1384        }
1385
1386        result
1387    }
1388
1389    /// Check if position is at end of line (only whitespace until newline)
1390    fn is_end_of_line(&self, from_pos: usize) -> bool {
1391        let bytes = self.source.as_bytes();
1392        let mut pos = from_pos;
1393        while pos < bytes.len() {
1394            match bytes[pos] {
1395                b' ' | b'\t' => pos += 1,
1396                b'\n' => return true,
1397                _ => return false,
1398            }
1399        }
1400        true // End of input is also end of line
1401    }
1402
1403    fn measure_next_line_indent(&self, from_pos: usize) -> Option<usize> {
1404        let bytes = self.source.as_bytes();
1405        let mut pos = from_pos;
1406
1407        while pos < bytes.len() && bytes[pos] != b'\n' {
1408            pos += 1;
1409        }
1410
1411        if pos >= bytes.len() {
1412            return None;
1413        }
1414
1415        pos += 1;
1416
1417        let mut indent = 0;
1418        while pos < bytes.len() {
1419            match bytes[pos] {
1420                b' ' => indent += 1,
1421                b'\t' => indent += 4,
1422                b'\n' => {
1423                    indent = 0;
1424                }
1425                _ => break,
1426            }
1427            pos += 1;
1428        }
1429
1430        if pos >= bytes.len() {
1431            return None;
1432        }
1433
1434        Some(indent)
1435    }
1436
    /// Case-insensitive wrapper around [`lexicon::word_to_number`]:
    /// lowercases `word` before performing the lexicon lookup.
    fn word_to_number(word: &str) -> Option<u32> {
        lexicon::word_to_number(&word.to_lowercase())
    }
1440
1441    /// Check if a hyphen at the current position is part of an ISO-8601 date.
1442    ///
1443    /// Detects patterns like:
1444    /// - "2026-" followed by "05-20" → first hyphen of date
1445    /// - "2026-05-" followed by "20" → second hyphen of date
1446    fn is_date_hyphen(current_word: &str, chars: &[char], char_idx: usize) -> bool {
1447        // Current word must be all digits (year or year-month)
1448        let word_chars: Vec<char> = current_word.chars().collect();
1449
1450        // Check for first hyphen pattern: YYYY- followed by MM-DD
1451        if word_chars.len() == 4 && word_chars.iter().all(|c| c.is_ascii_digit()) {
1452            // Check if followed by exactly 2 digits, hyphen, 2 digits
1453            if char_idx + 5 < chars.len()
1454                && chars[char_idx + 1].is_ascii_digit()
1455                && chars[char_idx + 2].is_ascii_digit()
1456                && chars[char_idx + 3] == '-'
1457                && chars[char_idx + 4].is_ascii_digit()
1458                && chars[char_idx + 5].is_ascii_digit()
1459            {
1460                return true;
1461            }
1462        }
1463
1464        // Check for second hyphen pattern: YYYY-MM- followed by DD
1465        if word_chars.len() == 7
1466            && word_chars[0..4].iter().all(|c| c.is_ascii_digit())
1467            && word_chars[4] == '-'
1468            && word_chars[5..7].iter().all(|c| c.is_ascii_digit())
1469        {
1470            // Check if followed by exactly 2 digits
1471            if char_idx + 2 < chars.len()
1472                && chars[char_idx + 1].is_ascii_digit()
1473                && chars[char_idx + 2].is_ascii_digit()
1474            {
1475                // Make sure we're not followed by more digits (would be a longer number)
1476                let next_not_digit = char_idx + 3 >= chars.len()
1477                    || !chars[char_idx + 3].is_ascii_digit();
1478                if next_not_digit {
1479                    return true;
1480                }
1481            }
1482        }
1483
1484        false
1485    }
1486
1487    /// Check if a colon is part of a time literal (e.g., 9:30am, 11:45pm).
1488    ///
1489    /// Detects patterns like:
1490    /// - "9:" followed by "30am" or "30pm"
1491    /// - "11:" followed by "45pm"
1492    fn is_time_colon(current_word: &str, chars: &[char], char_idx: usize) -> bool {
1493        // Current word must be 1-2 digits (hour)
1494        let word_chars: Vec<char> = current_word.chars().collect();
1495        if word_chars.is_empty() || word_chars.len() > 2 {
1496            return false;
1497        }
1498        if !word_chars.iter().all(|c| c.is_ascii_digit()) {
1499            return false;
1500        }
1501
1502        // Check if followed by exactly 2 digits and then "am" or "pm"
1503        if char_idx + 4 < chars.len()
1504            && chars[char_idx + 1].is_ascii_digit()
1505            && chars[char_idx + 2].is_ascii_digit()
1506        {
1507            // Check for "am" or "pm" suffix
1508            let next_two: String = chars[char_idx + 3..char_idx + 5].iter().collect();
1509            let lower = next_two.to_lowercase();
1510            if lower == "am" || lower == "pm" {
1511                // Make sure we're not followed by more alphabetic chars
1512                let after_suffix = char_idx + 5 >= chars.len()
1513                    || !chars[char_idx + 5].is_alphabetic();
1514                if after_suffix {
1515                    return true;
1516                }
1517            }
1518        }
1519
1520        false
1521    }
1522
1523    fn is_numeric_literal(word: &str) -> bool {
1524        if word.is_empty() {
1525            return false;
1526        }
1527        let chars: Vec<char> = word.chars().collect();
1528        let first = chars[0];
1529        if first.is_ascii_digit() {
1530            // Numeric literal: starts with digit (may have underscore separators like 1_000)
1531            return true;
1532        }
1533        // Symbolic numbers: only recognize known mathematical symbols
1534        // (aleph, omega, beth) followed by underscore and digits
1535        if let Some(underscore_pos) = word.rfind('_') {
1536            let before_underscore = &word[..underscore_pos];
1537            let after_underscore = &word[underscore_pos + 1..];
1538            // Must be a known mathematical symbol prefix AND digits after underscore
1539            let is_math_symbol = matches!(
1540                before_underscore.to_lowercase().as_str(),
1541                "aleph" | "omega" | "beth"
1542            );
1543            if is_math_symbol
1544                && !after_underscore.is_empty()
1545                && after_underscore.chars().all(|c| c.is_ascii_digit())
1546            {
1547                return true;
1548            }
1549        }
1550        false
1551    }
1552
1553    /// Parse a duration literal with SI suffix.
1554    ///
1555    /// Returns Some((nanoseconds, unit_str)) if the word is a valid duration literal,
1556    /// None otherwise.
1557    ///
1558    /// Supported suffixes:
1559    /// - ns: nanoseconds
1560    /// - us, μs: microseconds
1561    /// - ms: milliseconds
1562    /// - s, sec: seconds
1563    /// - min: minutes
1564    /// - h, hr: hours
1565    fn parse_duration_literal(word: &str) -> Option<(i64, &str)> {
1566        if word.is_empty() || !word.chars().next()?.is_ascii_digit() {
1567            return None;
1568        }
1569
1570        // SI suffix table with multipliers to nanoseconds
1571        const SUFFIXES: &[(&str, i64)] = &[
1572            ("ns", 1),
1573            ("μs", 1_000),
1574            ("us", 1_000),
1575            ("ms", 1_000_000),
1576            ("sec", 1_000_000_000),
1577            ("s", 1_000_000_000),
1578            ("min", 60_000_000_000),
1579            ("hr", 3_600_000_000_000),
1580            ("h", 3_600_000_000_000),
1581        ];
1582
1583        // Try each suffix (longer suffixes first to avoid partial matches)
1584        for (suffix, multiplier) in SUFFIXES {
1585            if word.ends_with(suffix) {
1586                let num_part = &word[..word.len() - suffix.len()];
1587                // Parse the numeric part (may have underscore separators)
1588                let cleaned: String = num_part.chars().filter(|c| *c != '_').collect();
1589                if let Ok(n) = cleaned.parse::<i64>() {
1590                    return Some((n.saturating_mul(*multiplier), *suffix));
1591                }
1592            }
1593        }
1594
1595        None
1596    }
1597
1598    /// Parse an ISO-8601 date literal (YYYY-MM-DD).
1599    ///
1600    /// Returns Some(days_since_epoch) if the word is a valid date literal,
1601    /// None otherwise.
1602    fn parse_date_literal(word: &str) -> Option<i32> {
1603        // Must match pattern: YYYY-MM-DD
1604        if word.len() != 10 {
1605            return None;
1606        }
1607
1608        let bytes = word.as_bytes();
1609
1610        // Check format: 4 digits, hyphen, 2 digits, hyphen, 2 digits
1611        if bytes[4] != b'-' || bytes[7] != b'-' {
1612            return None;
1613        }
1614
1615        // Parse year, month, day
1616        let year: i32 = word[0..4].parse().ok()?;
1617        let month: u32 = word[5..7].parse().ok()?;
1618        let day: u32 = word[8..10].parse().ok()?;
1619
1620        // Basic validation
1621        if month < 1 || month > 12 || day < 1 || day > 31 {
1622            return None;
1623        }
1624
1625        // Convert to days since Unix epoch using Howard Hinnant's algorithm
1626        // https://howardhinnant.github.io/date_algorithms.html
1627        let y = if month <= 2 { year - 1 } else { year };
1628        let era = if y >= 0 { y / 400 } else { (y - 399) / 400 };
1629        let yoe = (y - era * 400) as u32;
1630        let m = month;
1631        let doy = (153 * (if m > 2 { m - 3 } else { m + 9 }) + 2) / 5 + day - 1;
1632        let doe = yoe * 365 + yoe / 4 - yoe / 100 + doy;
1633        let days = era * 146097 + doe as i32 - 719468;
1634
1635        Some(days)
1636    }
1637
1638    /// Parse a time-of-day literal.
1639    ///
1640    /// Supported formats:
1641    /// - 12-hour with am/pm: "4pm", "9am", "12pm"
1642    /// - 12-hour with minutes: "9:30am", "11:45pm"
1643    /// - Special words: "noon" (12:00), "midnight" (00:00)
1644    ///
1645    /// Returns Some(nanos_from_midnight) if valid, None otherwise.
1646    fn parse_time_literal(word: &str) -> Option<i64> {
1647        let lower = word.to_lowercase();
1648
1649        // Handle special time words
1650        if lower == "noon" {
1651            return Some(12i64 * 3600 * 1_000_000_000);
1652        }
1653        if lower == "midnight" {
1654            return Some(0);
1655        }
1656
1657        // Handle 12-hour formats: "4pm", "9am", "9:30am", "11:45pm"
1658        let is_pm = lower.ends_with("pm");
1659        let is_am = lower.ends_with("am");
1660
1661        if !is_pm && !is_am {
1662            return None;
1663        }
1664
1665        // Strip the am/pm suffix
1666        let time_part = &lower[..lower.len() - 2];
1667
1668        // Check for hour:minute format
1669        let (hour, minute): (i64, i64) = if let Some(colon_idx) = time_part.find(':') {
1670            let hour_str = &time_part[..colon_idx];
1671            let min_str = &time_part[colon_idx + 1..];
1672            let h: i64 = hour_str.parse().ok()?;
1673            let m: i64 = min_str.parse().ok()?;
1674            (h, m)
1675        } else {
1676            // Just hour: "4pm", "9am"
1677            let h: i64 = time_part.parse().ok()?;
1678            (h, 0)
1679        };
1680
1681        // Validate ranges
1682        if hour < 1 || hour > 12 || minute < 0 || minute > 59 {
1683            return None;
1684        }
1685
1686        // Convert to 24-hour format
1687        let hour_24 = if is_am {
1688            if hour == 12 { 0 } else { hour }  // 12am = midnight = 0
1689        } else {
1690            if hour == 12 { 12 } else { hour + 12 }  // 12pm = noon = 12, 4pm = 16
1691        };
1692
1693        // Convert to nanoseconds from midnight
1694        let nanos = (hour_24 * 3600 + minute * 60) * 1_000_000_000;
1695        Some(nanos)
1696    }
1697
    /// Classify a word that may require multi-word lookahead or lexer-mode
    /// side effects (the Stage 2 entry point for ambiguous tokens).
    ///
    /// Handles, in priority order: `##` block headers (which also switch the
    /// lexer mode), fused multi-word forms ("each other", "at least N",
    /// "if and only if", "is equal to"), number words, duration/date/time
    /// literals, numeric literals, and the capitalized "A"/"An"
    /// article-vs-proper-name ambiguity. Everything else falls through to
    /// `classify_word`.
    ///
    /// NOTE(review): branches that call `consume_words` commit the lookahead,
    /// so the ordering of checks below is significant.
    fn classify_with_lookahead(&mut self, word: &str) -> TokenType {
        // Handle block headers (##Theorem, ##Main, etc.)
        if word.starts_with("##") {
            let block_name = &word[2..];
            let block_type = match block_name.to_lowercase().as_str() {
                "theorem" => BlockType::Theorem,
                "main" => BlockType::Main,
                "definition" => BlockType::Definition,
                "proof" => BlockType::Proof,
                "example" => BlockType::Example,
                "logic" => BlockType::Logic,
                "note" => BlockType::Note,
                "to" => BlockType::Function,  // Function definition block
                "a" | "an" => BlockType::TypeDef,  // Inline type definitions: ## A Point has:
                "policy" => BlockType::Policy,  // Security policy definitions
                "requires" => BlockType::Requires,  // External crate dependencies
                _ => BlockType::Note, // Default unknown block types to Note
            };

            // Update lexer mode based on block type
            self.mode = match block_type {
                BlockType::Main | BlockType::Function => LexerMode::Imperative,
                _ => LexerMode::Declarative,
            };

            return TokenType::BlockHeader { block_type };
        }

        let lower = word.to_lowercase();

        // Reciprocal pronoun: "each other" fuses into a single token.
        if lower == "each" && self.peek_sequence(&["other"]) {
            self.consume_words(1);
            return TokenType::Reciprocal;
        }

        // "to" + verb → infinitive marker; otherwise a plain preposition.
        if lower == "to" {
            if let Some(next) = self.peek_word(1) {
                if self.is_verb_like(next) {
                    return TokenType::To;
                }
            }
            let sym = self.interner.intern("to");
            return TokenType::Preposition(sym);
        }

        // Counting quantifiers: "at least N" / "at most N". If neither form
        // matches, fall through so "at" can be classified as a preposition.
        if lower == "at" {
            if let Some(next) = self.peek_word(1) {
                let next_lower = next.to_lowercase();
                if next_lower == "least" {
                    if let Some(num_word) = self.peek_word(2) {
                        if let Some(n) = Self::word_to_number(num_word) {
                            self.consume_words(2);
                            return TokenType::AtLeast(n);
                        }
                    }
                }
                if next_lower == "most" {
                    if let Some(num_word) = self.peek_word(2) {
                        if let Some(n) = Self::word_to_number(num_word) {
                            self.consume_words(2);
                            return TokenType::AtMost(n);
                        }
                    }
                }
            }
        }

        // Number words ("three") become cardinals.
        if let Some(n) = Self::word_to_number(&lower) {
            return TokenType::Cardinal(n);
        }

        // Check for duration literal first (e.g., "500ms", "2s", "50ns")
        // — must precede the plain numeric-literal check below.
        if let Some((nanos, unit)) = Self::parse_duration_literal(word) {
            let unit_sym = self.interner.intern(unit);
            return TokenType::DurationLiteral {
                nanos,
                original_unit: unit_sym,
            };
        }

        // Check for ISO-8601 date literal (e.g., "2026-05-20")
        if let Some(days) = Self::parse_date_literal(word) {
            return TokenType::DateLiteral { days };
        }

        // Check for time-of-day literal (e.g., "4pm", "9:30am", "noon", "midnight")
        if let Some(nanos_from_midnight) = Self::parse_time_literal(word) {
            return TokenType::TimeLiteral { nanos_from_midnight };
        }

        if Self::is_numeric_literal(word) {
            let sym = self.interner.intern(word);
            return TokenType::Number(sym);
        }

        // Biconditional: "if and only if" fuses into Iff.
        if lower == "if" && self.peek_sequence(&["and", "only", "if"]) {
            self.consume_words(3);
            return TokenType::Iff;
        }

        // Identity: "is equal to" / "is identical to" fuse into Identity.
        if lower == "is" {
            if self.peek_sequence(&["equal", "to"]) {
                self.consume_words(2);
                return TokenType::Identity;
            }
            if self.peek_sequence(&["identical", "to"]) {
                self.consume_words(2);
                return TokenType::Identity;
            }
        }

        if (lower == "a" || lower == "an") && word.chars().next().unwrap().is_uppercase() {
            // Capitalized "A" or "An" - disambiguate article vs proper name
            // Heuristic: articles are followed by nouns/adjectives, not verbs or keywords
            if let Some(next) = self.peek_word(1) {
                let next_lower = next.to_lowercase();
                let next_starts_lowercase = next.chars().next().map(|c| c.is_lowercase()).unwrap_or(false);

                // If followed by logical keyword, treat as proper name (propositional variable)
                if matches!(next_lower.as_str(), "if" | "and" | "or" | "implies" | "iff") {
                    let sym = self.interner.intern(word);
                    return TokenType::ProperName(sym);
                }

                // If next word is ONLY a verb (like "has", "is", "ran"), A is likely a name
                // Exception: gerunds (like "running") can follow articles
                // Exception: words in disambiguation_not_verbs (like "red") are not verbs
                // Exception: words that are also nouns/adjectives (like "fire") can follow articles
                let is_verb = self.lexicon.lookup_verb(&next_lower).is_some()
                    && !lexicon::is_disambiguation_not_verb(&next_lower);
                let is_gerund = next_lower.ends_with("ing");
                let is_also_noun_or_adj = self.is_noun_like(&next_lower) || self.is_adjective_like(&next_lower);
                if is_verb && !is_gerund && !is_also_noun_or_adj {
                    let sym = self.interner.intern(word);
                    return TokenType::ProperName(sym);
                }

                // Definition pattern: "A [TypeName] is a..." or "A [TypeName] has:" - treat A as article
                // even when TypeName is capitalized and unknown
                if let Some(third) = self.peek_word(2) {
                    let third_lower = third.to_lowercase();
                    // "has" for struct definitions: "A Point has:"
                    if third_lower == "is" || third_lower == "are" || third_lower == "has" {
                        return TokenType::Article(Definiteness::Indefinite);
                    }
                }

                // It's an article if next word is:
                // - A known noun or adjective, or
                // - Lowercase (likely a common word we don't recognize)
                let is_content_word = self.is_noun_like(&next_lower) || self.is_adjective_like(&next_lower);
                if is_content_word || next_starts_lowercase {
                    return TokenType::Article(Definiteness::Indefinite);
                }
            }
            // No usable lookahead: default to proper name.
            let sym = self.interner.intern(word);
            return TokenType::ProperName(sym);
        }

        // Single-word classification handles everything else.
        self.classify_word(word)
    }
1859
1860    fn is_noun_like(&self, word: &str) -> bool {
1861        if lexicon::is_noun_pattern(word) || lexicon::is_common_noun(word) {
1862            return true;
1863        }
1864        if word.ends_with("er") || word.ends_with("ian") || word.ends_with("ist") {
1865            return true;
1866        }
1867        false
1868    }
1869
1870    fn is_adjective_like(&self, word: &str) -> bool {
1871        lexicon::is_adjective(word) || lexicon::is_non_intersective(word)
1872    }
1873
1874    fn classify_word(&mut self, word: &str) -> TokenType {
1875        let lower = word.to_lowercase();
1876        let first_char = word.chars().next().unwrap();
1877
1878        // Disambiguate "that" as determiner vs complementizer
1879        // "that dog" → Article(Distal), "I know that he ran" → That (complementizer)
1880        if lower == "that" {
1881            if let Some(next) = self.peek_word(1) {
1882                let next_lower = next.to_lowercase();
1883                if self.is_noun_like(&next_lower) || self.is_adjective_like(&next_lower) {
1884                    return TokenType::Article(Definiteness::Distal);
1885                }
1886            }
1887        }
1888
1889        // Arrow token for return type syntax
1890        if word == "->" {
1891            return TokenType::Arrow;
1892        }
1893
1894        // Grand Challenge: Comparison operator tokens
1895        if word == "<=" {
1896            return TokenType::LtEq;
1897        }
1898        if word == ">=" {
1899            return TokenType::GtEq;
1900        }
1901        if word == "==" {
1902            return TokenType::EqEq;
1903        }
1904        if word == "!=" {
1905            return TokenType::NotEq;
1906        }
1907        if word == "<" {
1908            return TokenType::Lt;
1909        }
1910        if word == ">" {
1911            return TokenType::Gt;
1912        }
1913        // Single = for assignment (must come after == check)
1914        if word == "=" {
1915            return TokenType::Assign;
1916        }
1917
1918        if let Some(kind) = lexicon::lookup_keyword(&lower) {
1919            return kind;
1920        }
1921
1922        if let Some(kind) = lexicon::lookup_pronoun(&lower) {
1923            return kind;
1924        }
1925
1926        if let Some(def) = lexicon::lookup_article(&lower) {
1927            return TokenType::Article(def);
1928        }
1929
1930        if let Some(time) = lexicon::lookup_auxiliary(&lower) {
1931            return TokenType::Auxiliary(time);
1932        }
1933
1934        // Handle imperative keywords that might conflict with prepositions
1935        match lower.as_str() {
1936            "call" => return TokenType::Call,
1937            "in" if self.mode == LexerMode::Imperative => return TokenType::In,
1938            // Zone keywords (must come before is_preposition check)
1939            "inside" if self.mode == LexerMode::Imperative => return TokenType::Inside,
1940            // "at" for chunk access (must come before is_preposition check)
1941            "at" if self.mode == LexerMode::Imperative => return TokenType::At,
1942            // "into" for pipe send (must come before is_preposition check)
1943            "into" if self.mode == LexerMode::Imperative => return TokenType::Into,
1944            // Temporal span operator (must come before is_preposition check)
1945            "before" => return TokenType::Before,
1946            _ => {}
1947        }
1948
1949        if lexicon::is_preposition(&lower) {
1950            let sym = self.interner.intern(&lower);
1951            return TokenType::Preposition(sym);
1952        }
1953
1954        match lower.as_str() {
1955            "equals" => return TokenType::Equals,
1956            "item" => return TokenType::Item,
1957            "items" => return TokenType::Items,
1958            // Mutability keyword for `mut x = 5` syntax
1959            "mut" if self.mode == LexerMode::Imperative => return TokenType::Mut,
1960            "let" => {
1961                self.in_let_context = true;
1962                return TokenType::Let;
1963            }
1964            "set" => {
1965                // Check if "set" is used as a type (followed by "of") - "Set of Int"
1966                // This takes priority over the assignment keyword
1967                if self.peek_word(1).map_or(false, |w| w.to_lowercase() == "of") {
1968                    // It's a type like "Set of Int" - don't return keyword, let it be a noun
1969                } else if self.mode == LexerMode::Imperative {
1970                    // In Imperative mode, treat "set" as the assignment keyword
1971                    return TokenType::Set;
1972                } else {
1973                    // In Declarative mode, check positions 2-5 for "to"
1974                    // (handles field access like "set p's x to")
1975                    for offset in 2..=5 {
1976                        if self.peek_word(offset).map_or(false, |w| w.to_lowercase() == "to") {
1977                            return TokenType::Set;
1978                        }
1979                    }
1980                }
1981            }
1982            "return" => return TokenType::Return,
1983            "be" if self.in_let_context => {
1984                self.in_let_context = false;
1985                return TokenType::Be;
1986            }
1987            "while" => return TokenType::While,
1988            "assert" => return TokenType::Assert,
1989            "trust" => return TokenType::Trust,
1990            "check" => return TokenType::Check,
1991            // Theorem keywords (Declarative mode - for theorem blocks)
1992            "given" if self.mode == LexerMode::Declarative => return TokenType::Given,
1993            "prove" if self.mode == LexerMode::Declarative => return TokenType::Prove,
1994            "auto" if self.mode == LexerMode::Declarative => return TokenType::Auto,
1995            // P2P Networking keywords (Imperative mode only)
1996            "listen" if self.mode == LexerMode::Imperative => return TokenType::Listen,
1997            "connect" if self.mode == LexerMode::Imperative => return TokenType::NetConnect,
1998            "sleep" if self.mode == LexerMode::Imperative => return TokenType::Sleep,
1999            // GossipSub keywords (Imperative mode only)
2000            "sync" if self.mode == LexerMode::Imperative => return TokenType::Sync,
2001            // Persistence keywords
2002            "mount" if self.mode == LexerMode::Imperative => return TokenType::Mount,
2003            "persistent" => return TokenType::Persistent,  // Works in type expressions
2004            "combined" if self.mode == LexerMode::Imperative => return TokenType::Combined,
2005            // Go-like Concurrency keywords (Imperative mode only)
2006            // Note: "first" and "after" are NOT keywords - they're checked via lookahead in parser
2007            // to avoid conflicting with their use as variable names
2008            "launch" if self.mode == LexerMode::Imperative => return TokenType::Launch,
2009            "task" if self.mode == LexerMode::Imperative => return TokenType::Task,
2010            "pipe" if self.mode == LexerMode::Imperative => return TokenType::Pipe,
2011            "receive" if self.mode == LexerMode::Imperative => return TokenType::Receive,
2012            "stop" if self.mode == LexerMode::Imperative => return TokenType::Stop,
2013            "try" if self.mode == LexerMode::Imperative => return TokenType::Try,
2014            "into" if self.mode == LexerMode::Imperative => return TokenType::Into,
2015            "native" => return TokenType::Native,
2016            "escape" if self.mode == LexerMode::Imperative => return TokenType::Escape,
2017            "from" => return TokenType::From,
2018            "otherwise" => return TokenType::Otherwise,
2019            // Phase 30c: Else/elif as aliases for Otherwise/Otherwise If
2020            "else" => return TokenType::Else,
2021            "elif" => return TokenType::Elif,
2022            // Sum type definition (Declarative mode only - for enum "either...or...")
2023            "either" if self.mode == LexerMode::Declarative => return TokenType::Either,
2024            // Pattern matching statement
2025            "inspect" if self.mode == LexerMode::Imperative => return TokenType::Inspect,
2026            // Constructor keyword (Imperative mode only)
2027            "new" if self.mode == LexerMode::Imperative => return TokenType::New,
2028            // Only emit Give/Show as keywords in Imperative mode
2029            // In Declarative mode, they fall through to lexicon lookup as verbs
2030            "give" if self.mode == LexerMode::Imperative => return TokenType::Give,
2031            "show" if self.mode == LexerMode::Imperative => return TokenType::Show,
2032            // Collection operation keywords (Imperative mode only)
2033            "push" if self.mode == LexerMode::Imperative => return TokenType::Push,
2034            "pop" if self.mode == LexerMode::Imperative => return TokenType::Pop,
2035            "copy" if self.mode == LexerMode::Imperative => return TokenType::Copy,
2036            "through" if self.mode == LexerMode::Imperative => return TokenType::Through,
2037            "length" if self.mode == LexerMode::Imperative => return TokenType::Length,
2038            "at" if self.mode == LexerMode::Imperative => return TokenType::At,
2039            // Set operation keywords (Imperative mode only)
2040            "add" if self.mode == LexerMode::Imperative => return TokenType::Add,
2041            "remove" if self.mode == LexerMode::Imperative => return TokenType::Remove,
2042            "contains" if self.mode == LexerMode::Imperative => return TokenType::Contains,
2043            "union" if self.mode == LexerMode::Imperative => return TokenType::Union,
2044            "intersection" if self.mode == LexerMode::Imperative => return TokenType::Intersection,
2045            // Zone keywords (Imperative mode only)
2046            "inside" if self.mode == LexerMode::Imperative => return TokenType::Inside,
2047            "zone" if self.mode == LexerMode::Imperative => return TokenType::Zone,
2048            "called" if self.mode == LexerMode::Imperative => return TokenType::Called,
2049            "size" if self.mode == LexerMode::Imperative => return TokenType::Size,
2050            "mapped" if self.mode == LexerMode::Imperative => return TokenType::Mapped,
2051            // Structured Concurrency keywords (Imperative mode only)
2052            "attempt" if self.mode == LexerMode::Imperative => return TokenType::Attempt,
2053            "following" if self.mode == LexerMode::Imperative => return TokenType::Following,
2054            "simultaneously" if self.mode == LexerMode::Imperative => return TokenType::Simultaneously,
2055            // IO keywords (Imperative mode only)
2056            "read" if self.mode == LexerMode::Imperative => return TokenType::Read,
2057            "write" if self.mode == LexerMode::Imperative => return TokenType::Write,
2058            "console" if self.mode == LexerMode::Imperative => return TokenType::Console,
2059            "file" if self.mode == LexerMode::Imperative => return TokenType::File,
2060            // Agent System keywords (Imperative mode only)
2061            "spawn" if self.mode == LexerMode::Imperative => return TokenType::Spawn,
2062            "send" if self.mode == LexerMode::Imperative => return TokenType::Send,
2063            "await" if self.mode == LexerMode::Imperative => return TokenType::Await,
2064            // Serialization keyword (works in Definition blocks too)
2065            "portable" => return TokenType::Portable,
2066            // Sipping Protocol keywords (Imperative mode only)
2067            "manifest" if self.mode == LexerMode::Imperative => return TokenType::Manifest,
2068            "chunk" if self.mode == LexerMode::Imperative => return TokenType::Chunk,
2069            // CRDT keywords
2070            "shared" => return TokenType::Shared,  // Works in Definition blocks like Portable
2071            "merge" if self.mode == LexerMode::Imperative => return TokenType::Merge,
2072            "increase" if self.mode == LexerMode::Imperative => return TokenType::Increase,
2073            // Extended CRDT keywords
2074            "decrease" if self.mode == LexerMode::Imperative => return TokenType::Decrease,
2075            "append" if self.mode == LexerMode::Imperative => return TokenType::Append,
2076            "resolve" if self.mode == LexerMode::Imperative => return TokenType::Resolve,
2077            "values" if self.mode == LexerMode::Imperative => return TokenType::Values,
2078            // Type keywords (work in both modes like "Shared"):
2079            "tally" => return TokenType::Tally,
2080            "sharedset" => return TokenType::SharedSet,
2081            "sharedsequence" => return TokenType::SharedSequence,
2082            "collaborativesequence" => return TokenType::CollaborativeSequence,
2083            "sharedmap" => return TokenType::SharedMap,
2084            "divergent" => return TokenType::Divergent,
2085            "removewins" => return TokenType::RemoveWins,
2086            "addwins" => return TokenType::AddWins,
2087            "yata" => return TokenType::YATA,
2088            // Calendar time unit words (Span expressions)
2089            "day" | "days" => return TokenType::CalendarUnit(CalendarUnit::Day),
2090            "week" | "weeks" => return TokenType::CalendarUnit(CalendarUnit::Week),
2091            "month" | "months" => return TokenType::CalendarUnit(CalendarUnit::Month),
2092            "year" | "years" => return TokenType::CalendarUnit(CalendarUnit::Year),
2093            // Span-related keywords (note: "before" is handled earlier to avoid preposition conflict)
2094            "ago" => return TokenType::Ago,
2095            "hence" => return TokenType::Hence,
2096            "if" => return TokenType::If,
2097            "only" => return TokenType::Focus(FocusKind::Only),
2098            "even" => return TokenType::Focus(FocusKind::Even),
2099            "just" if self.peek_word(1).map_or(false, |w| {
2100                !self.is_verb_like(w) || w.to_lowercase() == "john" || w.chars().next().map_or(false, |c| c.is_uppercase())
2101            }) => return TokenType::Focus(FocusKind::Just),
2102            "much" => return TokenType::Measure(MeasureKind::Much),
2103            "little" => return TokenType::Measure(MeasureKind::Little),
2104            _ => {}
2105        }
2106
2107        if lexicon::is_scopal_adverb(&lower) {
2108            let sym = self.interner.intern(&Self::capitalize(&lower));
2109            return TokenType::ScopalAdverb(sym);
2110        }
2111
2112        if lexicon::is_temporal_adverb(&lower) {
2113            let sym = self.interner.intern(&Self::capitalize(&lower));
2114            return TokenType::TemporalAdverb(sym);
2115        }
2116
2117        if lexicon::is_non_intersective(&lower) {
2118            let sym = self.interner.intern(&Self::capitalize(&lower));
2119            return TokenType::NonIntersectiveAdjective(sym);
2120        }
2121
2122        if lexicon::is_adverb(&lower) {
2123            let sym = self.interner.intern(&Self::capitalize(&lower));
2124            return TokenType::Adverb(sym);
2125        }
2126        if lower.ends_with("ly") && !lexicon::is_not_adverb(&lower) && lower.len() > 4 {
2127            let sym = self.interner.intern(&Self::capitalize(&lower));
2128            return TokenType::Adverb(sym);
2129        }
2130
2131        if let Some(base) = self.try_parse_superlative(&lower) {
2132            let sym = self.interner.intern(&base);
2133            return TokenType::Superlative(sym);
2134        }
2135
2136        // Handle irregular comparatives (less, more, better, worse)
2137        let irregular_comparative = match lower.as_str() {
2138            "less" => Some("Little"),
2139            "more" => Some("Much"),
2140            "better" => Some("Good"),
2141            "worse" => Some("Bad"),
2142            _ => None,
2143        };
2144        if let Some(base) = irregular_comparative {
2145            let sym = self.interner.intern(base);
2146            return TokenType::Comparative(sym);
2147        }
2148
2149        if let Some(base) = self.try_parse_comparative(&lower) {
2150            let sym = self.interner.intern(&base);
2151            return TokenType::Comparative(sym);
2152        }
2153
2154        if lexicon::is_performative(&lower) {
2155            let sym = self.interner.intern(&Self::capitalize(&lower));
2156            return TokenType::Performative(sym);
2157        }
2158
2159        if lexicon::is_base_verb_early(&lower) {
2160            let sym = self.interner.intern(&Self::capitalize(&lower));
2161            let class = lexicon::lookup_verb_class(&lower);
2162            return TokenType::Verb {
2163                lemma: sym,
2164                time: Time::Present,
2165                aspect: Aspect::Simple,
2166                class,
2167            };
2168        }
2169
2170        // Check for gerunds/progressive verbs BEFORE ProperName check
2171        // "Running" at start of sentence should be Verb, not ProperName
2172        if lower.ends_with("ing") && lower.len() > 4 {
2173            if let Some(entry) = self.lexicon.lookup_verb(&lower) {
2174                let sym = self.interner.intern(&entry.lemma);
2175                return TokenType::Verb {
2176                    lemma: sym,
2177                    time: entry.time,
2178                    aspect: entry.aspect,
2179                    class: entry.class,
2180                };
2181            }
2182        }
2183
2184        if first_char.is_uppercase() {
2185            // Smart Lexicon: Check if this capitalized word is actually a common noun
2186            // Only apply for sentence-initial words (followed by verb) to avoid
2187            // breaking type definitions like "A Point has:"
2188            //
2189            // Pattern: "Farmers walk." → Farmers is plural of Farmer (common noun)
2190            // Pattern: "A Point has:" → Point is a type name (proper name)
2191            if let Some(next) = self.peek_word(1) {
2192                let next_lower = next.to_lowercase();
2193                // If next word is a verb, this capitalized word is likely a subject noun
2194                let is_followed_by_verb = self.lexicon.lookup_verb(&next_lower).is_some()
2195                    || matches!(next_lower.as_str(), "is" | "are" | "was" | "were" | "has" | "have" | "had");
2196
2197                if is_followed_by_verb {
2198                    // Check if lowercase version is a derivable common noun
2199                    if let Some(analysis) = lexicon::analyze_word(&lower) {
2200                        match analysis {
2201                            lexicon::WordAnalysis::Noun(meta) if meta.number == lexicon::Number::Plural => {
2202                                // It's a plural noun - definitely a common noun
2203                                let sym = self.interner.intern(&lower);
2204                                return TokenType::Noun(sym);
2205                            }
2206                            lexicon::WordAnalysis::DerivedNoun { number: lexicon::Number::Plural, .. } => {
2207                                // Derived plural agentive noun (e.g., "Bloggers")
2208                                let sym = self.interner.intern(&lower);
2209                                return TokenType::Noun(sym);
2210                            }
2211                            _ => {
2212                                // Singular nouns at sentence start could still be proper names
2213                                // e.g., "John walks." vs "Farmer walks."
2214                            }
2215                        }
2216                    }
2217                }
2218            }
2219
2220            let sym = self.interner.intern(word);
2221            return TokenType::ProperName(sym);
2222        }
2223
2224        let verb_entry = self.lexicon.lookup_verb(&lower);
2225        let is_noun = lexicon::is_common_noun(&lower);
2226        let is_adj = self.is_adjective_like(&lower);
2227        let is_disambiguated = lexicon::is_disambiguation_not_verb(&lower);
2228
2229        // Ambiguous: word is Verb AND (Noun OR Adjective), not disambiguated
2230        if verb_entry.is_some() && (is_noun || is_adj) && !is_disambiguated {
2231            let entry = verb_entry.unwrap();
2232            let verb_token = TokenType::Verb {
2233                lemma: self.interner.intern(&entry.lemma),
2234                time: entry.time,
2235                aspect: entry.aspect,
2236                class: entry.class,
2237            };
2238
2239            let mut alternatives = Vec::new();
2240            if is_noun {
2241                alternatives.push(TokenType::Noun(self.interner.intern(word)));
2242            }
2243            if is_adj {
2244                alternatives.push(TokenType::Adjective(self.interner.intern(word)));
2245            }
2246
2247            return TokenType::Ambiguous {
2248                primary: Box::new(verb_token),
2249                alternatives,
2250            };
2251        }
2252
2253        // Disambiguated to noun/adjective (not verb)
2254        if let Some(_) = &verb_entry {
2255            if is_disambiguated {
2256                let sym = self.interner.intern(word);
2257                if is_noun {
2258                    return TokenType::Noun(sym);
2259                }
2260                return TokenType::Adjective(sym);
2261            }
2262        }
2263
2264        // Pure verb
2265        if let Some(entry) = verb_entry {
2266            let sym = self.interner.intern(&entry.lemma);
2267            return TokenType::Verb {
2268                lemma: sym,
2269                time: entry.time,
2270                aspect: entry.aspect,
2271                class: entry.class,
2272            };
2273        }
2274
2275        // Pure noun
2276        if is_noun {
2277            let sym = self.interner.intern(word);
2278            return TokenType::Noun(sym);
2279        }
2280
2281        if lexicon::is_base_verb(&lower) {
2282            let sym = self.interner.intern(&Self::capitalize(&lower));
2283            let class = lexicon::lookup_verb_class(&lower);
2284            return TokenType::Verb {
2285                lemma: sym,
2286                time: Time::Present,
2287                aspect: Aspect::Simple,
2288                class,
2289            };
2290        }
2291
2292        if lower.ends_with("ian")
2293            || lower.ends_with("er")
2294            || lower == "logic"
2295            || lower == "time"
2296            || lower == "men"
2297            || lower == "book"
2298            || lower == "house"
2299            || lower == "code"
2300            || lower == "user"
2301        {
2302            let sym = self.interner.intern(word);
2303            return TokenType::Noun(sym);
2304        }
2305
2306        if lexicon::is_particle(&lower) {
2307            let sym = self.interner.intern(&lower);
2308            return TokenType::Particle(sym);
2309        }
2310
2311        let sym = self.interner.intern(word);
2312        TokenType::Adjective(sym)
2313    }
2314
2315    fn capitalize(s: &str) -> String {
2316        let mut chars = s.chars();
2317        match chars.next() {
2318            None => String::new(),
2319            Some(first) => first.to_uppercase().collect::<String>() + chars.as_str(),
2320        }
2321    }
2322
2323    pub fn is_collective_verb(lemma: &str) -> bool {
2324        lexicon::is_collective_verb(&lemma.to_lowercase())
2325    }
2326
2327    pub fn is_mixed_verb(lemma: &str) -> bool {
2328        lexicon::is_mixed_verb(&lemma.to_lowercase())
2329    }
2330
2331    pub fn is_distributive_verb(lemma: &str) -> bool {
2332        lexicon::is_distributive_verb(&lemma.to_lowercase())
2333    }
2334
2335    pub fn is_intensional_predicate(lemma: &str) -> bool {
2336        lexicon::is_intensional_predicate(&lemma.to_lowercase())
2337    }
2338
2339    pub fn is_opaque_verb(lemma: &str) -> bool {
2340        lexicon::is_opaque_verb(&lemma.to_lowercase())
2341    }
2342
2343    pub fn is_ditransitive_verb(lemma: &str) -> bool {
2344        lexicon::is_ditransitive_verb(&lemma.to_lowercase())
2345    }
2346
2347    fn is_verb_like(&self, word: &str) -> bool {
2348        let lower = word.to_lowercase();
2349        if lexicon::is_infinitive_verb(&lower) {
2350            return true;
2351        }
2352        if let Some(entry) = self.lexicon.lookup_verb(&lower) {
2353            return entry.lemma.len() > 0;
2354        }
2355        false
2356    }
2357
2358    pub fn is_subject_control_verb(lemma: &str) -> bool {
2359        lexicon::is_subject_control_verb(&lemma.to_lowercase())
2360    }
2361
2362    pub fn is_raising_verb(lemma: &str) -> bool {
2363        lexicon::is_raising_verb(&lemma.to_lowercase())
2364    }
2365
2366    pub fn is_object_control_verb(lemma: &str) -> bool {
2367        lexicon::is_object_control_verb(&lemma.to_lowercase())
2368    }
2369
2370    pub fn is_weather_verb(lemma: &str) -> bool {
2371        matches!(
2372            lemma.to_lowercase().as_str(),
2373            "rain" | "snow" | "hail" | "thunder" | "pour"
2374        )
2375    }
2376
2377    fn try_parse_superlative(&self, word: &str) -> Option<String> {
2378        if !word.ends_with("est") || word.len() < 5 {
2379            return None;
2380        }
2381
2382        let base = &word[..word.len() - 3];
2383
2384        if base.len() >= 2 {
2385            let chars: Vec<char> = base.chars().collect();
2386            let last = chars[chars.len() - 1];
2387            let second_last = chars[chars.len() - 2];
2388            if last == second_last && !"aeiou".contains(last) {
2389                let stem = &base[..base.len() - 1];
2390                if lexicon::is_gradable_adjective(stem) {
2391                    return Some(Self::capitalize(stem));
2392                }
2393            }
2394        }
2395
2396        if base.ends_with("i") {
2397            let stem = format!("{}y", &base[..base.len() - 1]);
2398            if lexicon::is_gradable_adjective(&stem) {
2399                return Some(Self::capitalize(&stem));
2400            }
2401        }
2402
2403        if lexicon::is_gradable_adjective(base) {
2404            return Some(Self::capitalize(base));
2405        }
2406
2407        None
2408    }
2409
2410    fn try_parse_comparative(&self, word: &str) -> Option<String> {
2411        if !word.ends_with("er") || word.len() < 4 {
2412            return None;
2413        }
2414
2415        let base = &word[..word.len() - 2];
2416
2417        if base.len() >= 2 {
2418            let chars: Vec<char> = base.chars().collect();
2419            let last = chars[chars.len() - 1];
2420            let second_last = chars[chars.len() - 2];
2421            if last == second_last && !"aeiou".contains(last) {
2422                let stem = &base[..base.len() - 1];
2423                if lexicon::is_gradable_adjective(stem) {
2424                    return Some(Self::capitalize(stem));
2425                }
2426            }
2427        }
2428
2429        if base.ends_with("i") {
2430            let stem = format!("{}y", &base[..base.len() - 1]);
2431            if lexicon::is_gradable_adjective(&stem) {
2432                return Some(Self::capitalize(&stem));
2433            }
2434        }
2435
2436        if lexicon::is_gradable_adjective(base) {
2437            return Some(Self::capitalize(base));
2438        }
2439
2440        None
2441    }
2442}
2443
// Unit tests for Stage 2 word-level tokenization: punctuation handling,
// noun/verb disambiguation, multi-word operators, reflexives, and spans.
#[cfg(test)]
mod tests {
    use super::*;

    // Apostrophe inside a word ("it's") must not abort tokenization.
    #[test]
    fn lexer_handles_apostrophe() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("it's raining", &mut interner);
        let tokens = lexer.tokenize();
        assert!(!tokens.is_empty());
    }

    // Trailing "?" must not abort tokenization.
    #[test]
    fn lexer_handles_question_mark() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("Is it raining?", &mut interner);
        let tokens = lexer.tokenize();
        assert!(!tokens.is_empty());
    }

    // "ring" should be disambiguated to a Noun rather than a Verb.
    #[test]
    fn ring_is_not_verb() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("ring", &mut interner);
        let tokens = lexer.tokenize();
        assert!(matches!(tokens[0].kind, TokenType::Noun(_)));
    }

    // Diagnostic-heavy test: dumps the token stream, then checks that the
    // word "that" classifies as TokenType::That.
    #[test]
    fn debug_that_token() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("The cat that runs", &mut interner);
        let tokens = lexer.tokenize();
        for (i, t) in tokens.iter().enumerate() {
            let lex = interner.resolve(t.lexeme);
            eprintln!("Token[{}]: {:?} -> {:?}", i, lex, t.kind);
        }
        let that_token = tokens.iter().find(|t| interner.resolve(t.lexeme) == "that");
        if let Some(t) = that_token {
            // Verify discriminant comparison works
            let check = std::mem::discriminant(&t.kind) == std::mem::discriminant(&TokenType::That);
            eprintln!("Discriminant check for That: {}", check);
            assert!(matches!(t.kind, TokenType::That), "'that' should be TokenType::That, got {:?}", t.kind);
        } else {
            panic!("No 'that' token found");
        }
    }

    // "bus" should be disambiguated to a Noun rather than a Verb.
    #[test]
    fn bus_is_not_verb() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("bus", &mut interner);
        let tokens = lexer.tokenize();
        assert!(matches!(tokens[0].kind, TokenType::Noun(_)));
    }

    // "a" must lex as an indefinite article, not a noun/adjective.
    #[test]
    fn lowercase_a_is_article() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("a car", &mut interner);
        let tokens = lexer.tokenize();
        for (i, t) in tokens.iter().enumerate() {
            let lex = interner.resolve(t.lexeme);
            eprintln!("Token[{}]: {:?} -> {:?}", i, lex, t.kind);
        }
        assert_eq!(tokens[0].kind, TokenType::Article(Definiteness::Indefinite));
        assert!(matches!(tokens[1].kind, TokenType::Noun(_)), "Expected Noun, got {:?}", tokens[1].kind);
    }

    // Verb/adjective ambiguity: "open" should yield an Ambiguous token with
    // Verb as primary and Adjective among the alternatives.
    #[test]
    fn open_is_ambiguous() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("open", &mut interner);
        let tokens = lexer.tokenize();

        if let TokenType::Ambiguous { primary, alternatives } = &tokens[0].kind {
            assert!(matches!(**primary, TokenType::Verb { .. }), "Primary should be Verb");
            assert!(alternatives.iter().any(|t| matches!(t, TokenType::Adjective(_))),
                "Should have Adjective alternative");
        } else {
            panic!("Expected Ambiguous token for 'open', got {:?}", tokens[0].kind);
        }
    }

    // Smoke test for quantifier + noun + copula classification.
    #[test]
    fn basic_tokenization() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("All men are mortal.", &mut interner);
        let tokens = lexer.tokenize();
        assert_eq!(tokens[0].kind, TokenType::All);
        assert!(matches!(tokens[1].kind, TokenType::Noun(_)));
        assert_eq!(tokens[2].kind, TokenType::Are);
    }

    // Multi-word operator: "if and only if" fuses into a single Iff token.
    #[test]
    fn iff_tokenizes_as_single_token() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("A if and only if B", &mut interner);
        let tokens = lexer.tokenize();
        assert!(
            tokens.iter().any(|t| t.kind == TokenType::Iff),
            "should contain Iff token: got {:?}",
            tokens
        );
    }

    // Multi-word operator: "is equal to" produces an Identity token.
    #[test]
    fn is_equal_to_tokenizes_as_identity() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("Socrates is equal to Socrates", &mut interner);
        let tokens = lexer.tokenize();
        assert!(
            tokens.iter().any(|t| t.kind == TokenType::Identity),
            "should contain Identity token: got {:?}",
            tokens
        );
    }

    // Multi-word operator: "is identical to" also produces Identity.
    #[test]
    fn is_identical_to_tokenizes_as_identity() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("Clark is identical to Superman", &mut interner);
        let tokens = lexer.tokenize();
        assert!(
            tokens.iter().any(|t| t.kind == TokenType::Identity),
            "should contain Identity token: got {:?}",
            tokens
        );
    }

    // Reflexive pronoun "itself" maps to the Reflexive token.
    #[test]
    fn itself_tokenizes_as_reflexive() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("John loves itself", &mut interner);
        let tokens = lexer.tokenize();
        assert!(
            tokens.iter().any(|t| t.kind == TokenType::Reflexive),
            "should contain Reflexive token: got {:?}",
            tokens
        );
    }

    // Reflexive pronoun "himself" maps to the Reflexive token.
    #[test]
    fn himself_tokenizes_as_reflexive() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("John sees himself", &mut interner);
        let tokens = lexer.tokenize();
        assert!(
            tokens.iter().any(|t| t.kind == TokenType::Reflexive),
            "should contain Reflexive token: got {:?}",
            tokens
        );
    }

    // Infinitive marker: "to stay" yields To followed by a Verb token.
    #[test]
    fn to_stay_tokenizes_correctly() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("to stay", &mut interner);
        let tokens = lexer.tokenize();
        assert!(
            tokens.iter().any(|t| t.kind == TokenType::To),
            "should contain To token: got {:?}",
            tokens
        );
        assert!(
            tokens.iter().any(|t| matches!(t.kind, TokenType::Verb { .. })),
            "should contain Verb token for stay: got {:?}",
            tokens
        );
    }

    // "John's" splits into a ProperName plus a Possessive token.
    #[test]
    fn possessive_apostrophe_s() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("John's dog", &mut interner);
        let tokens = lexer.tokenize();
        assert!(
            tokens.iter().any(|t| t.kind == TokenType::Possessive),
            "should contain Possessive token: got {:?}",
            tokens
        );
        assert!(
            tokens.iter().any(|t| matches!(&t.kind, TokenType::ProperName(_))),
            "should have John as proper name: got {:?}",
            tokens
        );
    }

    // Spans are byte offsets into the original input; each token's span must
    // slice back to its exact source text, and EOF sits at input end.
    #[test]
    fn lexer_produces_valid_spans() {
        let input = "All men are mortal.";
        let mut interner = Interner::new();
        let mut lexer = Lexer::new(input, &mut interner);
        let tokens = lexer.tokenize();

        // "All" at 0..3
        assert_eq!(tokens[0].span.start, 0);
        assert_eq!(tokens[0].span.end, 3);
        assert_eq!(&input[tokens[0].span.start..tokens[0].span.end], "All");

        // "men" at 4..7
        assert_eq!(tokens[1].span.start, 4);
        assert_eq!(tokens[1].span.end, 7);
        assert_eq!(&input[tokens[1].span.start..tokens[1].span.end], "men");

        // "are" at 8..11
        assert_eq!(tokens[2].span.start, 8);
        assert_eq!(tokens[2].span.end, 11);
        assert_eq!(&input[tokens[2].span.start..tokens[2].span.end], "are");

        // "mortal" at 12..18
        assert_eq!(tokens[3].span.start, 12);
        assert_eq!(tokens[3].span.end, 18);
        assert_eq!(&input[tokens[3].span.start..tokens[3].span.end], "mortal");

        // "." at 18..19
        assert_eq!(tokens[4].span.start, 18);
        assert_eq!(tokens[4].span.end, 19);

        // EOF at end
        assert_eq!(tokens[5].span.start, input.len());
        assert_eq!(tokens[5].kind, TokenType::EOF);
    }
}