1use logicaffeine_base::Interner;
38use crate::lexicon::{self, Aspect, Definiteness, Lexicon, Time};
39use crate::token::{BlockType, CalendarUnit, FocusKind, MeasureKind, Span, Token, TokenType};
40
/// A line-level token produced by [`LineLexer`]: indentation structure
/// (off-side rule) plus the raw content of each non-blank line.
#[derive(Debug, Clone, PartialEq)]
pub enum LineToken {
    /// Indentation increased relative to the enclosing level.
    Indent,
    /// One indentation level was closed.
    Dedent,
    /// End of a logical line.
    Newline,
    /// A non-blank line: its right-trimmed text and byte range in the source.
    Content { text: String, start: usize, end: usize },
}
59
/// Splits source text into [`LineToken`]s, tracking indentation levels with a
/// stack so nested blocks produce balanced Indent/Dedent pairs.
pub struct LineLexer<'a> {
    source: &'a str,
    /// Byte view of `source` for cheap ASCII scanning.
    bytes: &'a [u8],
    /// Active indentation widths; the base level 0 is always at the bottom.
    indent_stack: Vec<usize>,
    /// Dedent tokens still owed to the caller.
    pending_dedents: usize,
    /// Current byte offset into `source`.
    position: usize,
    /// True while a Content token is buffered but not yet emitted.
    has_pending_content: bool,
    // Byte range and text of the buffered Content token.
    pending_content_start: usize,
    pending_content_end: usize,
    pending_content_text: String,
    /// Set once the end of input has been reached.
    finished_lines: bool,
    /// Set when the most recent line opened a new indentation level.
    emitted_indent: bool,
    /// Byte ranges of escape-block bodies, treated as opaque spans.
    escape_body_ranges: Vec<(usize, usize)>,
}
80
81impl<'a> LineLexer<'a> {
82 pub fn new(source: &'a str) -> Self {
83 Self {
84 source,
85 bytes: source.as_bytes(),
86 indent_stack: vec![0],
87 pending_dedents: 0,
88 position: 0,
89 has_pending_content: false,
90 pending_content_start: 0,
91 pending_content_end: 0,
92 pending_content_text: String::new(),
93 finished_lines: false,
94 emitted_indent: false,
95 escape_body_ranges: Vec::new(),
96 }
97 }
98
99 pub fn with_escape_ranges(source: &'a str, escape_body_ranges: Vec<(usize, usize)>) -> Self {
100 Self {
101 source,
102 bytes: source.as_bytes(),
103 indent_stack: vec![0],
104 pending_dedents: 0,
105 position: 0,
106 has_pending_content: false,
107 pending_content_start: 0,
108 pending_content_end: 0,
109 pending_content_text: String::new(),
110 finished_lines: false,
111 emitted_indent: false,
112 escape_body_ranges,
113 }
114 }
115
116 fn is_in_escape_body(&self, pos: usize) -> bool {
118 self.escape_body_ranges.iter().any(|(start, end)| pos >= *start && pos < *end)
119 }
120
121 fn measure_indent(&self, line_start: usize) -> (usize, usize) {
124 let mut indent = 0;
125 let mut pos = line_start;
126
127 while pos < self.bytes.len() {
128 match self.bytes[pos] {
129 b' ' => {
130 indent += 1;
131 pos += 1;
132 }
133 b'\t' => {
134 indent += 4; pos += 1;
136 }
137 _ => break,
138 }
139 }
140
141 (indent, pos)
142 }
143
144 fn read_line_content(&self, content_start: usize) -> (String, usize, usize, usize) {
147 let mut pos = content_start;
148
149 while pos < self.bytes.len() && self.bytes[pos] != b'\n' {
151 pos += 1;
152 }
153
154 let content_end = pos;
155 let text = self.source[content_start..content_end].trim_end().to_string();
156
157 let next_line_start = if pos < self.bytes.len() && self.bytes[pos] == b'\n' {
159 pos + 1
160 } else {
161 pos
162 };
163
164 (text, content_start, content_end, next_line_start)
165 }
166
167 fn is_blank_line(&self, line_start: usize) -> bool {
169 let mut pos = line_start;
170 while pos < self.bytes.len() {
171 match self.bytes[pos] {
172 b' ' | b'\t' => pos += 1,
173 b'\n' => return true,
174 _ => return false,
175 }
176 }
177 true }
179
180 fn process_next_line(&mut self) -> bool {
183 while self.position < self.bytes.len() && self.is_blank_line(self.position) {
185 while self.position < self.bytes.len() && self.bytes[self.position] != b'\n' {
187 self.position += 1;
188 }
189 if self.position < self.bytes.len() {
190 self.position += 1; }
192 }
193
194 if self.position >= self.bytes.len() {
196 self.finished_lines = true;
197 if self.indent_stack.len() > 1 {
199 self.pending_dedents = self.indent_stack.len() - 1;
200 self.indent_stack.truncate(1);
201 }
202 return self.pending_dedents > 0;
203 }
204
205 let (line_indent, content_start) = self.measure_indent(self.position);
207
208 let (text, start, end, next_pos) = self.read_line_content(content_start);
210
211 if text.is_empty() {
213 self.position = next_pos;
214 return self.process_next_line();
215 }
216
217 let current_indent = *self.indent_stack.last().unwrap();
218
219 if line_indent > current_indent {
221 self.indent_stack.push(line_indent);
223 self.emitted_indent = true;
224 self.has_pending_content = true;
226 self.pending_content_text = text;
227 self.pending_content_start = start;
228 self.pending_content_end = end;
229 self.position = next_pos;
230 return true;
232 } else if line_indent < current_indent {
233 while self.indent_stack.len() > 1 {
235 let top = *self.indent_stack.last().unwrap();
236 if line_indent < top {
237 self.indent_stack.pop();
238 self.pending_dedents += 1;
239 } else {
240 break;
241 }
242 }
243 self.has_pending_content = true;
245 self.pending_content_text = text;
246 self.pending_content_start = start;
247 self.pending_content_end = end;
248 self.position = next_pos;
249 return true;
250 } else {
251 self.has_pending_content = true;
253 self.pending_content_text = text;
254 self.pending_content_start = start;
255 self.pending_content_end = end;
256 self.position = next_pos;
257 return true;
258 }
259 }
260}
261
impl<'a> Iterator for LineLexer<'a> {
    type Item = LineToken;

    /// Emits tokens in priority order: owed Dedents first, then any buffered
    /// Content, then whatever structure the next source line produces.
    fn next(&mut self) -> Option<LineToken> {
        // Drain dedents queued by a previous call before anything else.
        if self.pending_dedents > 0 {
            self.pending_dedents -= 1;
            return Some(LineToken::Dedent);
        }

        // Content buffered behind an Indent/Dedent comes next.
        if self.has_pending_content {
            self.has_pending_content = false;
            let text = std::mem::take(&mut self.pending_content_text);
            let start = self.pending_content_start;
            let end = self.pending_content_end;
            return Some(LineToken::Content { text, start, end });
        }

        if !self.finished_lines {
            // Stack depth before reading the line tells us whether it indented.
            let had_indent = self.indent_stack.len();
            if self.process_next_line() {
                if self.indent_stack.len() > had_indent {
                    // The line opened a new level: Indent precedes its Content.
                    return Some(LineToken::Indent);
                }
                if self.pending_dedents > 0 {
                    self.pending_dedents -= 1;
                    return Some(LineToken::Dedent);
                }
                if self.has_pending_content {
                    self.has_pending_content = false;
                    let text = std::mem::take(&mut self.pending_content_text);
                    let start = self.pending_content_start;
                    let end = self.pending_content_end;
                    return Some(LineToken::Content { text, start, end });
                }
            } else if self.pending_dedents > 0 {
                // End of input can still owe dedents for open levels.
                self.pending_dedents -= 1;
                return Some(LineToken::Dedent);
            }
        }

        // Final drain for dedents queued after the last line.
        if self.pending_dedents > 0 {
            self.pending_dedents -= 1;
            return Some(LineToken::Dedent);
        }

        None
    }
}
322
/// Which mode the word-level lexer is currently classifying in.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum LexerMode {
    /// Declarative mode (the default).
    #[default]
    Declarative,
    /// Imperative mode.
    Imperative,
}
333
/// Word-level tokenizer: splits the input into [`WordItem`]s up front, then
/// classifies them into [`Token`]s and threads in indentation structure.
pub struct Lexer<'a> {
    /// Pre-split words with their byte spans and trailing punctuation.
    words: Vec<WordItem>,
    /// Index of the next word to consume.
    pos: usize,
    lexicon: Lexicon,
    /// Interner used for all token lexemes.
    interner: &'a mut Interner,
    /// Total byte length of the input (used for the EOF token's span).
    input_len: usize,
    /// True while inside a `let` construct; cleared at sentence-ending '.'/'?'.
    in_let_context: bool,
    mode: LexerMode,
    /// Owned copy of the input, re-scanned for indentation analysis.
    source: String,
    /// Byte ranges of escape-block bodies; indentation events inside them are dropped.
    escape_body_ranges: Vec<(usize, usize)>,
}
346
/// One word produced by the splitting pass, plus bookkeeping for spans.
struct WordItem {
    /// The word text (empty when the item carries only punctuation).
    word: String,
    /// Punctuation character immediately following the word, if any.
    trailing_punct: Option<char>,
    /// Byte offset where the word starts in the input.
    start: usize,
    /// Byte offset just past the word.
    end: usize,
    /// Byte offset of `trailing_punct` in the input, when present.
    punct_pos: Option<usize>,
}
354
355impl<'a> Lexer<'a> {
356 pub fn new(input: &str, interner: &'a mut Interner) -> Self {
380 let escape_ranges = Self::find_escape_block_ranges(input);
381 let escape_body_ranges: Vec<(usize, usize)> = escape_ranges.iter()
382 .map(|(_, end, content_start, _)| (*content_start, *end))
383 .collect();
384 let words = Self::split_into_words(input, &escape_ranges);
385 let input_len = input.len();
386
387 Lexer {
388 words,
389 pos: 0,
390 lexicon: Lexicon::new(),
391 interner,
392 input_len,
393 in_let_context: false,
394 mode: LexerMode::Declarative,
395 source: input.to_string(),
396 escape_body_ranges,
397 }
398 }
399
    /// Scans for `escape to <target>:` headers and returns one entry per block
    /// found: `(body_byte_start, body_byte_end, content_start, raw_code)`,
    /// where `raw_code` is the body dedented to its first line's indentation.
    fn find_escape_block_ranges(source: &str) -> Vec<(usize, usize, usize, String)> {
        let mut ranges = Vec::new();
        let lines: Vec<&str> = source.split('\n').collect();
        // Byte offset of the start of each line (each line costs len + '\n').
        let mut line_starts: Vec<usize> = Vec::with_capacity(lines.len());
        let mut pos = 0;
        for line in &lines {
            line_starts.push(pos);
            pos += line.len() + 1;
        }

        let mut i = 0;
        while i < lines.len() {
            let trimmed = lines[i].trim();
            let lower = trimmed.to_lowercase();
            // Header forms: exactly "escape to rust:", a line ending with
            // " escape to rust:", or any "escape to <target>:" line.
            if lower == "escape to rust:" ||
                lower.ends_with(" escape to rust:") ||
                (lower.starts_with("escape to ") && lower.ends_with(':'))
            {
                let header_indent = Self::measure_indent_static(lines[i]);
                i += 1;

                // Skip blank lines between the header and the body.
                let mut body_start_line = i;
                while body_start_line < lines.len() && lines[body_start_line].trim().is_empty() {
                    body_start_line += 1;
                }

                if body_start_line >= lines.len() {
                    continue;
                }

                // A real body must be indented deeper than its header.
                let base_indent = Self::measure_indent_static(lines[body_start_line]);
                if base_indent <= header_indent {
                    continue;
                }

                let body_byte_start = line_starts[body_start_line];
                let mut body_end_line = body_start_line;
                let mut code_lines: Vec<String> = Vec::new();

                // Collect lines until indentation falls below the body's base.
                let mut j = body_start_line;
                while j < lines.len() {
                    let line = lines[j];
                    if line.trim().is_empty() {
                        // Blank lines inside the body are kept (normalized empty).
                        code_lines.push(String::new());
                        body_end_line = j;
                        j += 1;
                        continue;
                    }
                    let line_indent = Self::measure_indent_static(line);
                    if line_indent < base_indent {
                        break;
                    }
                    let stripped = Self::strip_indent(line, base_indent);
                    code_lines.push(stripped);
                    body_end_line = j;
                    j += 1;
                }

                // Drop trailing blank lines from the collected body.
                while code_lines.last().map_or(false, |l| l.is_empty()) {
                    code_lines.pop();
                }

                if !code_lines.is_empty() {
                    let body_byte_end = if body_end_line + 1 < lines.len() {
                        line_starts[body_end_line + 1]
                    } else {
                        source.len()
                    };
                    // Content starts after the first body line's indentation.
                    let content_start = body_byte_start + Self::leading_whitespace_bytes(lines[body_start_line]);
                    let raw_code = code_lines.join("\n");
                    ranges.push((body_byte_start, body_byte_end, content_start, raw_code));
                }

                i = j;
            } else {
                i += 1;
            }
        }

        ranges
    }
497
498 fn leading_whitespace_bytes(line: &str) -> usize {
500 let mut count = 0;
501 for c in line.chars() {
502 match c {
503 ' ' | '\t' => count += c.len_utf8(),
504 _ => break,
505 }
506 }
507 count
508 }
509
510 fn measure_indent_static(line: &str) -> usize {
512 let mut indent = 0;
513 for c in line.chars() {
514 match c {
515 ' ' => indent += 1,
516 '\t' => indent += 4,
517 _ => break,
518 }
519 }
520 indent
521 }
522
523 fn strip_indent(line: &str, count: usize) -> String {
525 let mut stripped = 0;
526 let mut byte_pos = 0;
527 for (i, c) in line.char_indices() {
528 if stripped >= count {
529 byte_pos = i;
530 break;
531 }
532 match c {
533 ' ' => { stripped += 1; byte_pos = i + 1; }
534 '\t' => { stripped += 4; byte_pos = i + 1; }
535 _ => { byte_pos = i; break; }
536 }
537 }
538 if stripped < count {
539 byte_pos = line.len();
540 }
541 line[byte_pos..].to_string()
542 }
543
544 fn split_into_words(input: &str, escape_ranges: &[(usize, usize, usize, String)]) -> Vec<WordItem> {
545 let mut items = Vec::new();
546 let mut current_word = String::new();
547 let mut word_start = 0;
548 let chars: Vec<char> = input.chars().collect();
549 let mut char_idx = 0;
550 let mut skip_count = 0;
551 let mut skip_to_byte: Option<usize> = None;
553
554 for (i, c) in input.char_indices() {
555 if skip_count > 0 {
556 skip_count -= 1;
557 char_idx += 1;
558 continue;
559 }
560 if let Some(end) = skip_to_byte {
562 if i < end {
563 char_idx += 1;
564 continue;
565 }
566 skip_to_byte = None;
567 word_start = i;
568 }
569 if let Some((_, end, content_start, raw_code)) = escape_ranges.iter().find(|(s, _, _, _)| i == *s) {
571 if !current_word.is_empty() {
573 items.push(WordItem {
574 word: std::mem::take(&mut current_word),
575 trailing_punct: None,
576 start: word_start,
577 end: i,
578 punct_pos: None,
579 });
580 }
581 items.push(WordItem {
584 word: format!("\x00ESC:{}", raw_code),
585 trailing_punct: None,
586 start: *content_start,
587 end: *end,
588 punct_pos: None,
589 });
590 skip_to_byte = Some(*end);
591 word_start = *end;
592 char_idx += 1;
593 continue;
594 }
595 let next_pos = i + c.len_utf8();
596 match c {
597 ' ' | '\t' | '\n' | '\r' => {
598 if !current_word.is_empty() {
599 items.push(WordItem {
600 word: std::mem::take(&mut current_word),
601 trailing_punct: None,
602 start: word_start,
603 end: i,
604 punct_pos: None,
605 });
606 }
607 word_start = next_pos;
608 }
609 '.' => {
610 let prev_is_digit = !current_word.is_empty()
612 && current_word.chars().last().map_or(false, |ch| ch.is_ascii_digit());
613 let next_is_digit = char_idx + 1 < chars.len()
614 && chars[char_idx + 1].is_ascii_digit();
615
616 if prev_is_digit && next_is_digit {
617 current_word.push(c);
619 } else {
620 if !current_word.is_empty() {
622 items.push(WordItem {
623 word: std::mem::take(&mut current_word),
624 trailing_punct: Some(c),
625 start: word_start,
626 end: i,
627 punct_pos: Some(i),
628 });
629 } else {
630 items.push(WordItem {
631 word: String::new(),
632 trailing_punct: Some(c),
633 start: i,
634 end: next_pos,
635 punct_pos: Some(i),
636 });
637 }
638 word_start = next_pos;
639 }
640 }
641 '#' => {
642 if char_idx + 1 < chars.len() && chars[char_idx + 1] == '#' {
644 if !current_word.is_empty() {
647 items.push(WordItem {
648 word: std::mem::take(&mut current_word),
649 trailing_punct: None,
650 start: word_start,
651 end: i,
652 punct_pos: None,
653 });
654 }
655 let header_start = i;
657 let mut j = char_idx + 2;
658 while j < chars.len() && (chars[j] == ' ' || chars[j] == '\t') {
659 j += 1;
660 }
661 let mut block_word = String::from("##");
663 while j < chars.len() && chars[j].is_alphabetic() {
664 block_word.push(chars[j]);
665 j += 1;
666 }
667 if block_word.len() > 2 {
668 items.push(WordItem {
669 word: block_word,
670 trailing_punct: None,
671 start: header_start,
672 end: header_start + (j - char_idx),
673 punct_pos: None,
674 });
675 }
676 skip_count = j - char_idx - 1;
677 word_start = header_start + (j - char_idx);
678 } else {
679 let mut look_ahead = char_idx + 1;
683 while look_ahead < chars.len() && chars[look_ahead] != '\n' {
684 skip_count += 1;
685 look_ahead += 1;
686 }
687 if !current_word.is_empty() {
688 items.push(WordItem {
689 word: std::mem::take(&mut current_word),
690 trailing_punct: None,
691 start: word_start,
692 end: i,
693 punct_pos: None,
694 });
695 }
696 word_start = look_ahead + 1; }
698 }
699 '"' => {
701 if !current_word.is_empty() {
703 items.push(WordItem {
704 word: std::mem::take(&mut current_word),
705 trailing_punct: None,
706 start: word_start,
707 end: i,
708 punct_pos: None,
709 });
710 }
711
712 if char_idx + 2 < chars.len() && chars[char_idx + 1] == '"' && chars[char_idx + 2] == '"' {
714 let string_start = i;
715 let mut j = char_idx + 3; if j < chars.len() && chars[j] == '\n' {
718 j += 1;
719 }
720 let mut raw_content = String::new();
721 while j < chars.len() {
723 if j + 2 < chars.len() && chars[j] == '"' && chars[j + 1] == '"' && chars[j + 2] == '"' {
724 break;
725 }
726 raw_content.push(chars[j]);
727 j += 1;
728 }
729 if raw_content.ends_with('\n') {
731 raw_content.pop();
732 }
733 let dedented = Self::dedent_triple_quote(&raw_content);
735 let end_pos = if j + 2 < chars.len() { j + 3 } else { chars.len() };
736 items.push(WordItem {
737 word: format!("\x00STR:{}", dedented),
738 trailing_punct: None,
739 start: string_start,
740 end: end_pos,
741 punct_pos: None,
742 });
743 if j + 2 < chars.len() {
745 skip_count = (j + 2) - char_idx;
746 } else {
747 skip_count = chars.len() - 1 - char_idx;
748 }
749 word_start = end_pos;
750 } else {
751 let string_start = i;
753 let mut j = char_idx + 1;
754 let mut string_content = String::new();
755 while j < chars.len() && chars[j] != '"' {
756 if chars[j] == '\\' && j + 1 < chars.len() {
757 j += 1;
759 if j < chars.len() {
760 string_content.push(chars[j]);
761 }
762 } else {
763 string_content.push(chars[j]);
764 }
765 j += 1;
766 }
767
768 items.push(WordItem {
771 word: format!("\x00STR:{}", string_content),
772 trailing_punct: None,
773 start: string_start,
774 end: if j < chars.len() { j + 1 } else { j },
775 punct_pos: None,
776 });
777
778 if j < chars.len() {
780 skip_count = j - char_idx;
781 } else {
782 skip_count = j - char_idx - 1;
783 }
784 word_start = if j < chars.len() { j + 1 } else { j };
785 }
786 }
787 '`' => {
789 if !current_word.is_empty() {
791 items.push(WordItem {
792 word: std::mem::take(&mut current_word),
793 trailing_punct: None,
794 start: word_start,
795 end: i,
796 punct_pos: None,
797 });
798 }
799
800 let char_start = i;
802 let mut j = char_idx + 1;
803 let mut char_content = String::new();
804
805 if j < chars.len() {
806 if chars[j] == '\\' && j + 1 < chars.len() {
807 j += 1;
809 let escaped_char = match chars[j] {
810 'n' => '\n',
811 't' => '\t',
812 'r' => '\r',
813 '\\' => '\\',
814 '`' => '`',
815 '0' => '\0',
816 c => c,
817 };
818 char_content.push(escaped_char);
819 j += 1;
820 } else if chars[j] != '`' {
821 char_content.push(chars[j]);
823 j += 1;
824 }
825 }
826
827 if j < chars.len() && chars[j] == '`' {
829 j += 1; }
831
832 items.push(WordItem {
834 word: format!("\x00CHAR:{}", char_content),
835 trailing_punct: None,
836 start: char_start,
837 end: if j <= chars.len() { char_start + (j - char_idx) } else { char_start + 1 },
838 punct_pos: None,
839 });
840
841 if j > char_idx + 1 {
842 skip_count = j - char_idx - 1;
843 }
844 word_start = char_start + (j - char_idx);
845 }
846 '-' if char_idx + 1 < chars.len() && chars[char_idx + 1] == '>' => {
848 if !current_word.is_empty() {
850 items.push(WordItem {
851 word: std::mem::take(&mut current_word),
852 trailing_punct: None,
853 start: word_start,
854 end: i,
855 punct_pos: None,
856 });
857 }
858 items.push(WordItem {
860 word: "->".to_string(),
861 trailing_punct: None,
862 start: i,
863 end: i + 2,
864 punct_pos: None,
865 });
866 skip_count = 1; word_start = i + 2;
868 }
869 '<' if char_idx + 1 < chars.len() && chars[char_idx + 1] == '=' => {
871 if !current_word.is_empty() {
872 items.push(WordItem {
873 word: std::mem::take(&mut current_word),
874 trailing_punct: None,
875 start: word_start,
876 end: i,
877 punct_pos: None,
878 });
879 }
880 items.push(WordItem {
881 word: "<=".to_string(),
882 trailing_punct: None,
883 start: i,
884 end: i + 2,
885 punct_pos: None,
886 });
887 skip_count = 1;
888 word_start = i + 2;
889 }
890 '>' if char_idx + 1 < chars.len() && chars[char_idx + 1] == '=' => {
892 if !current_word.is_empty() {
893 items.push(WordItem {
894 word: std::mem::take(&mut current_word),
895 trailing_punct: None,
896 start: word_start,
897 end: i,
898 punct_pos: None,
899 });
900 }
901 items.push(WordItem {
902 word: ">=".to_string(),
903 trailing_punct: None,
904 start: i,
905 end: i + 2,
906 punct_pos: None,
907 });
908 skip_count = 1;
909 word_start = i + 2;
910 }
911 '=' if char_idx + 1 < chars.len() && chars[char_idx + 1] == '=' => {
913 if !current_word.is_empty() {
914 items.push(WordItem {
915 word: std::mem::take(&mut current_word),
916 trailing_punct: None,
917 start: word_start,
918 end: i,
919 punct_pos: None,
920 });
921 }
922 items.push(WordItem {
923 word: "==".to_string(),
924 trailing_punct: None,
925 start: i,
926 end: i + 2,
927 punct_pos: None,
928 });
929 skip_count = 1;
930 word_start = i + 2;
931 }
932 '!' if char_idx + 1 < chars.len() && chars[char_idx + 1] == '=' => {
934 if !current_word.is_empty() {
935 items.push(WordItem {
936 word: std::mem::take(&mut current_word),
937 trailing_punct: None,
938 start: word_start,
939 end: i,
940 punct_pos: None,
941 });
942 }
943 items.push(WordItem {
944 word: "!=".to_string(),
945 trailing_punct: None,
946 start: i,
947 end: i + 2,
948 punct_pos: None,
949 });
950 skip_count = 1;
951 word_start = i + 2;
952 }
953 '-' if Self::is_date_hyphen(¤t_word, &chars, char_idx) => {
955 current_word.push(c);
957 }
958 ':' if Self::is_time_colon(¤t_word, &chars, char_idx) => {
960 current_word.push(c);
962 }
963 '+' | '-' if Self::is_exponent_sign(¤t_word, &chars, char_idx) => {
965 current_word.push(c);
966 }
967 '(' | ')' | '[' | ']' | ',' | '?' | '!' | ':' | '+' | '-' | '*' | '/' | '%' | '<' | '>' | '=' => {
968 if !current_word.is_empty() {
969 items.push(WordItem {
970 word: std::mem::take(&mut current_word),
971 trailing_punct: Some(c),
972 start: word_start,
973 end: i,
974 punct_pos: Some(i),
975 });
976 } else {
977 items.push(WordItem {
978 word: String::new(),
979 trailing_punct: Some(c),
980 start: i,
981 end: next_pos,
982 punct_pos: Some(i),
983 });
984 }
985 word_start = next_pos;
986 }
987 '\'' => {
988 let remaining: String = chars[char_idx + 1..].iter().collect();
990 let remaining_lower = remaining.to_lowercase();
991
992 if remaining_lower.starts_with("t ") || remaining_lower.starts_with("t.") ||
993 remaining_lower.starts_with("t,") || remaining_lower == "t" ||
994 (char_idx + 1 < chars.len() && chars[char_idx + 1] == 't' &&
995 (char_idx + 2 >= chars.len() || !chars[char_idx + 2].is_alphabetic())) {
996 let word_lower = current_word.to_lowercase();
998 if word_lower == "don" || word_lower == "doesn" || word_lower == "didn" {
999 let base = if word_lower == "don" { "do" }
1001 else if word_lower == "doesn" { "does" }
1002 else { "did" };
1003 items.push(WordItem {
1004 word: base.to_string(),
1005 trailing_punct: None,
1006 start: word_start,
1007 end: i,
1008 punct_pos: None,
1009 });
1010 items.push(WordItem {
1011 word: "not".to_string(),
1012 trailing_punct: None,
1013 start: i,
1014 end: i + 2,
1015 punct_pos: None,
1016 });
1017 current_word.clear();
1018 word_start = next_pos + 1;
1019 skip_count = 1;
1020 } else if word_lower == "won" {
1021 items.push(WordItem {
1023 word: "will".to_string(),
1024 trailing_punct: None,
1025 start: word_start,
1026 end: i,
1027 punct_pos: None,
1028 });
1029 items.push(WordItem {
1030 word: "not".to_string(),
1031 trailing_punct: None,
1032 start: i,
1033 end: i + 2,
1034 punct_pos: None,
1035 });
1036 current_word.clear();
1037 word_start = next_pos + 1;
1038 skip_count = 1;
1039 } else if word_lower == "can" {
1040 items.push(WordItem {
1042 word: "cannot".to_string(),
1043 trailing_punct: None,
1044 start: word_start,
1045 end: i + 2,
1046 punct_pos: None,
1047 });
1048 current_word.clear();
1049 word_start = next_pos + 1;
1050 skip_count = 1;
1051 } else {
1052 if !current_word.is_empty() {
1054 items.push(WordItem {
1055 word: std::mem::take(&mut current_word),
1056 trailing_punct: Some('\''),
1057 start: word_start,
1058 end: i,
1059 punct_pos: Some(i),
1060 });
1061 }
1062 word_start = next_pos;
1063 }
1064 } else {
1065 if !current_word.is_empty() {
1067 items.push(WordItem {
1068 word: std::mem::take(&mut current_word),
1069 trailing_punct: Some('\''),
1070 start: word_start,
1071 end: i,
1072 punct_pos: Some(i),
1073 });
1074 }
1075 word_start = next_pos;
1076 }
1077 }
1078 c if c.is_alphabetic() || c.is_ascii_digit() || (c == '.' && !current_word.is_empty() && current_word.chars().all(|ch| ch.is_ascii_digit())) || c == '_' => {
1079 if current_word.is_empty() {
1080 word_start = i;
1081 }
1082 current_word.push(c);
1083 }
1084 _ => {
1085 word_start = next_pos;
1086 }
1087 }
1088 char_idx += 1;
1089 }
1090
1091 if !current_word.is_empty() {
1092 items.push(WordItem {
1093 word: current_word,
1094 trailing_punct: None,
1095 start: word_start,
1096 end: input.len(),
1097 punct_pos: None,
1098 });
1099 }
1100
1101 items
1102 }
1103
    /// Peeks at the word `offset` positions ahead of the cursor, if any.
    fn peek_word(&self, offset: usize) -> Option<&str> {
        self.words.get(self.pos + offset).map(|w| w.word.as_str())
    }
1107
1108 fn peek_sequence(&self, expected: &[&str]) -> bool {
1109 for (i, &exp) in expected.iter().enumerate() {
1110 match self.peek_word(i + 1) {
1111 Some(w) if w.to_lowercase() == exp => continue,
1112 _ => return false,
1113 }
1114 }
1115 true
1116 }
1117
    /// Advances the cursor past `count` words.
    fn consume_words(&mut self, count: usize) {
        self.pos += count;
    }
1121
    /// Converts the pre-split words into [`Token`]s, appends an EOF token, and
    /// then threads in Indent/Dedent structure via `insert_indentation_tokens`.
    pub fn tokenize(&mut self) -> Vec<Token> {
        let mut tokens = Vec::new();

        while self.pos < self.words.len() {
            let item = &self.words[self.pos];
            let word = item.word.clone();
            let trailing_punct = item.trailing_punct;
            let word_start = item.start;
            let word_end = item.end;
            let punct_pos = item.punct_pos;

            // Punctuation-only item (no word text).
            if word.is_empty() {
                if let Some(punct) = trailing_punct {
                    let kind = match punct {
                        '(' => TokenType::LParen,
                        ')' => TokenType::RParen,
                        '[' => TokenType::LBracket,
                        ']' => TokenType::RBracket,
                        ',' => TokenType::Comma,
                        ':' => TokenType::Colon,
                        '.' | '?' => {
                            // Sentence end leaves any `let` context.
                            self.in_let_context = false;
                            TokenType::Period
                        }
                        '!' => TokenType::Exclamation,
                        '+' => TokenType::Plus,
                        '-' => TokenType::Minus,
                        '*' => TokenType::Star,
                        '/' => TokenType::Slash,
                        '%' => TokenType::Percent,
                        '<' => TokenType::Lt,
                        '>' => TokenType::Gt,
                        '=' => TokenType::Assign,
                        // Unknown punctuation is silently skipped.
                        _ => {
                            self.pos += 1;
                            continue;
                        }
                    };
                    let lexeme = self.interner.intern(&punct.to_string());
                    let span = Span::new(word_start, word_end);
                    tokens.push(Token::new(kind, lexeme, span));
                }
                self.pos += 1;
                continue;
            }

            // String literal marker produced by `split_into_words`.
            if word.starts_with("\x00STR:") {
                let content = &word[5..];
                let span = Span::new(word_start, word_end);
                if Self::has_unescaped_brace(content) {
                    let sym = self.interner.intern(content);
                    tokens.push(Token::new(TokenType::InterpolatedString(sym), sym, span));
                } else {
                    // Plain string: collapse doubled braces to literals.
                    let normalized = content.replace("{{", "{").replace("}}", "}");
                    let sym = self.interner.intern(&normalized);
                    tokens.push(Token::new(TokenType::StringLiteral(sym), sym, span));
                }
                self.pos += 1;
                continue;
            }

            // Char literal marker.
            if word.starts_with("\x00CHAR:") {
                let content = &word[6..];
                let sym = self.interner.intern(content);
                let span = Span::new(word_start, word_end);
                tokens.push(Token::new(TokenType::CharLiteral(sym), sym, span));
                self.pos += 1;
                continue;
            }

            // Escape-block body marker.
            if word.starts_with("\x00ESC:") {
                let content = &word[5..];
                let sym = self.interner.intern(content);
                let span = Span::new(word_start, word_end);
                tokens.push(Token::new(TokenType::EscapeBlock(sym), sym, span));
                self.pos += 1;
                continue;
            }

            // Ordinary word: classify and emit.
            let kind = self.classify_with_lookahead(&word);
            let lexeme = self.interner.intern(&word);
            let span = Span::new(word_start, word_end);
            tokens.push(Token::new(kind, lexeme, span));

            if let Some(punct) = trailing_punct {
                // word + "'" + "s" becomes a Possessive token.
                if punct == '\'' {
                    if let Some(next_item) = self.words.get(self.pos + 1) {
                        if next_item.word.to_lowercase() == "s" {
                            let poss_lexeme = self.interner.intern("'s");
                            let poss_start = punct_pos.unwrap_or(word_end);
                            let poss_end = next_item.end;
                            tokens.push(Token::new(TokenType::Possessive, poss_lexeme, Span::new(poss_start, poss_end)));
                            self.pos += 1;
                            // Punctuation after the "s" still needs a token.
                            if let Some(s_punct) = next_item.trailing_punct {
                                let kind = match s_punct {
                                    '(' => TokenType::LParen,
                                    ')' => TokenType::RParen,
                                    '[' => TokenType::LBracket,
                                    ']' => TokenType::RBracket,
                                    ',' => TokenType::Comma,
                                    ':' => TokenType::Colon,
                                    '.' | '?' => TokenType::Period,
                                    '!' => TokenType::Exclamation,
                                    '+' => TokenType::Plus,
                                    '-' => TokenType::Minus,
                                    '*' => TokenType::Star,
                                    '/' => TokenType::Slash,
                                    '%' => TokenType::Percent,
                                    '<' => TokenType::Lt,
                                    '>' => TokenType::Gt,
                                    '=' => TokenType::Assign,
                                    _ => {
                                        self.pos += 1;
                                        continue;
                                    }
                                };
                                let s_punct_pos = next_item.punct_pos.unwrap_or(next_item.end);
                                let lexeme = self.interner.intern(&s_punct.to_string());
                                tokens.push(Token::new(kind, lexeme, Span::new(s_punct_pos, s_punct_pos + 1)));
                            }
                            self.pos += 1;
                            continue;
                        }
                    }
                    // Apostrophe without a following "s": drop it.
                    self.pos += 1;
                    continue;
                }

                let kind = match punct {
                    '(' => TokenType::LParen,
                    ')' => TokenType::RParen,
                    '[' => TokenType::LBracket,
                    ']' => TokenType::RBracket,
                    ',' => TokenType::Comma,
                    ':' => TokenType::Colon,
                    '.' | '?' => {
                        // Sentence end leaves any `let` context.
                        self.in_let_context = false;
                        TokenType::Period
                    }
                    '!' => TokenType::Exclamation,
                    '+' => TokenType::Plus,
                    '-' => TokenType::Minus,
                    '*' => TokenType::Star,
                    '/' => TokenType::Slash,
                    '%' => TokenType::Percent,
                    '<' => TokenType::Lt,
                    '>' => TokenType::Gt,
                    '=' => TokenType::Assign,
                    _ => {
                        self.pos += 1;
                        continue;
                    }
                };
                let p_start = punct_pos.unwrap_or(word_end);
                let lexeme = self.interner.intern(&punct.to_string());
                tokens.push(Token::new(kind, lexeme, Span::new(p_start, p_start + 1)));
            }

            self.pos += 1;
        }

        // Terminal EOF token spanning the end of input.
        let eof_lexeme = self.interner.intern("");
        let eof_span = Span::new(self.input_len, self.input_len);
        tokens.push(Token::new(TokenType::EOF, eof_lexeme, eof_span));

        self.insert_indentation_tokens(tokens)
    }
1303
    /// Merges Indent/Dedent tokens — derived from a fresh [`LineLexer`] pass
    /// over the source — into the word-token stream, ordered by byte position.
    fn insert_indentation_tokens(&mut self, tokens: Vec<Token>) -> Vec<Token> {
        let mut result = Vec::new();
        let empty_sym = self.interner.intern("");

        let line_lexer = LineLexer::new(&self.source);
        let line_tokens: Vec<LineToken> = line_lexer.collect();

        // (byte position, is_indent) events, anchored to the content line
        // that follows each run of Indent/Dedent line tokens.
        let mut structural_events: Vec<(usize, bool)> = Vec::new();
        let mut pending_indents = 0usize;
        let mut pending_dedents = 0usize;

        for line_token in &line_tokens {
            match line_token {
                LineToken::Indent => {
                    pending_indents += 1;
                }
                LineToken::Dedent => {
                    pending_dedents += 1;
                }
                LineToken::Content { start, .. } => {
                    // Dedents are pushed first so they precede indents that
                    // land at the same position.
                    for _ in 0..pending_dedents {
                        structural_events.push((*start, false));
                    }
                    pending_dedents = 0;

                    for _ in 0..pending_indents {
                        structural_events.push((*start, true));
                    }
                    pending_indents = 0;
                }
                LineToken::Newline => {}
            }
        }

        // Dedents still owed at EOF anchor to the end of input.
        for _ in 0..pending_dedents {
            structural_events.push((self.input_len, false));
        }

        if !self.escape_body_ranges.is_empty() {
            // Indentation inside escape-block bodies belongs to the embedded
            // language, not to us: drop those events.
            let mut filtered = Vec::new();
            for &(pos, is_indent) in &structural_events {
                let is_inside_escape_body = self.escape_body_ranges.iter().any(|(start, end)| {
                    pos > *start && pos < *end
                });
                if !is_inside_escape_body {
                    filtered.push((pos, is_indent));
                }
            }
            structural_events = filtered;
        }

        {
            // Multi-line string literals can contain indentation-looking lines
            // too; drop events falling strictly inside sizable string spans.
            let string_spans: Vec<(usize, usize)> = tokens.iter()
                .filter(|t| matches!(t.kind, TokenType::StringLiteral(_) | TokenType::InterpolatedString(_)))
                .filter(|t| t.span.end - t.span.start > 6)
                .map(|t| (t.span.start, t.span.end))
                .collect();
            if !string_spans.is_empty() {
                structural_events.retain(|&(pos, _)| {
                    !string_spans.iter().any(|(start, end)| pos > *start && pos < *end)
                });
            }
        }

        // Order: by position; at equal positions dedents (false) before
        // indents (true).
        structural_events.sort_by(|a, b| {
            if a.0 != b.0 {
                a.0.cmp(&b.0)
            } else {
                a.1.cmp(&b.1)
            }
        });

        let mut event_idx = 0;
        // Most recent end-of-line colon, used to anchor Indent spans.
        let mut last_colon_pos: Option<usize> = None;

        for token in tokens.iter() {
            let token_start = token.span.start;

            // Emit every structural event positioned at or before this token.
            while event_idx < structural_events.len() {
                let (event_pos, is_indent) = structural_events[event_idx];

                if event_pos <= token_start {
                    // Indent spans point at the colon that opened the block,
                    // when one was seen at end-of-line.
                    let span = if is_indent {
                        Span::new(last_colon_pos.unwrap_or(event_pos), last_colon_pos.unwrap_or(event_pos))
                    } else {
                        Span::new(event_pos, event_pos)
                    };
                    let kind = if is_indent { TokenType::Indent } else { TokenType::Dedent };
                    result.push(Token::new(kind, empty_sym, span));
                    event_idx += 1;
                } else {
                    break;
                }
            }

            result.push(token.clone());

            if token.kind == TokenType::Colon && self.is_end_of_line(token.span.end) {
                last_colon_pos = Some(token.span.end);
            }
        }

        // Flush events positioned past the last token.
        while event_idx < structural_events.len() {
            let (event_pos, is_indent) = structural_events[event_idx];
            let span = Span::new(event_pos, event_pos);
            let kind = if is_indent { TokenType::Indent } else { TokenType::Dedent };
            result.push(Token::new(kind, empty_sym, span));
            event_idx += 1;
        }

        // Keep EOF as the very last token.
        let eof_pos = result.iter().position(|t| t.kind == TokenType::EOF);
        if let Some(pos) = eof_pos {
            let eof = result.remove(pos);
            result.push(eof);
        }

        result
    }
1455
1456 fn is_end_of_line(&self, from_pos: usize) -> bool {
1458 let bytes = self.source.as_bytes();
1459 let mut pos = from_pos;
1460 while pos < bytes.len() {
1461 match bytes[pos] {
1462 b' ' | b'\t' => pos += 1,
1463 b'\n' => return true,
1464 _ => return false,
1465 }
1466 }
1467 true }
1469
1470 fn measure_next_line_indent(&self, from_pos: usize) -> Option<usize> {
1471 let bytes = self.source.as_bytes();
1472 let mut pos = from_pos;
1473
1474 while pos < bytes.len() && bytes[pos] != b'\n' {
1475 pos += 1;
1476 }
1477
1478 if pos >= bytes.len() {
1479 return None;
1480 }
1481
1482 pos += 1;
1483
1484 let mut indent = 0;
1485 while pos < bytes.len() {
1486 match bytes[pos] {
1487 b' ' => indent += 1,
1488 b'\t' => indent += 4,
1489 b'\n' => {
1490 indent = 0;
1491 }
1492 _ => break,
1493 }
1494 pos += 1;
1495 }
1496
1497 if pos >= bytes.len() {
1498 return None;
1499 }
1500
1501 Some(indent)
1502 }
1503
1504 fn word_to_number(word: &str) -> Option<u32> {
1505 lexicon::word_to_number(&word.to_lowercase())
1506 }
1507
1508 fn is_date_hyphen(current_word: &str, chars: &[char], char_idx: usize) -> bool {
1514 let word_chars: Vec<char> = current_word.chars().collect();
1516
1517 if word_chars.len() == 4 && word_chars.iter().all(|c| c.is_ascii_digit()) {
1519 if char_idx + 5 < chars.len()
1521 && chars[char_idx + 1].is_ascii_digit()
1522 && chars[char_idx + 2].is_ascii_digit()
1523 && chars[char_idx + 3] == '-'
1524 && chars[char_idx + 4].is_ascii_digit()
1525 && chars[char_idx + 5].is_ascii_digit()
1526 {
1527 return true;
1528 }
1529 }
1530
1531 if word_chars.len() == 7
1533 && word_chars[0..4].iter().all(|c| c.is_ascii_digit())
1534 && word_chars[4] == '-'
1535 && word_chars[5..7].iter().all(|c| c.is_ascii_digit())
1536 {
1537 if char_idx + 2 < chars.len()
1539 && chars[char_idx + 1].is_ascii_digit()
1540 && chars[char_idx + 2].is_ascii_digit()
1541 {
1542 let next_not_digit = char_idx + 3 >= chars.len()
1544 || !chars[char_idx + 3].is_ascii_digit();
1545 if next_not_digit {
1546 return true;
1547 }
1548 }
1549 }
1550
1551 false
1552 }
1553
1554 fn is_time_colon(current_word: &str, chars: &[char], char_idx: usize) -> bool {
1560 let word_chars: Vec<char> = current_word.chars().collect();
1562 if word_chars.is_empty() || word_chars.len() > 2 {
1563 return false;
1564 }
1565 if !word_chars.iter().all(|c| c.is_ascii_digit()) {
1566 return false;
1567 }
1568
1569 if char_idx + 4 < chars.len()
1571 && chars[char_idx + 1].is_ascii_digit()
1572 && chars[char_idx + 2].is_ascii_digit()
1573 {
1574 let next_two: String = chars[char_idx + 3..char_idx + 5].iter().collect();
1576 let lower = next_two.to_lowercase();
1577 if lower == "am" || lower == "pm" {
1578 let after_suffix = char_idx + 5 >= chars.len()
1580 || !chars[char_idx + 5].is_alphabetic();
1581 if after_suffix {
1582 return true;
1583 }
1584 }
1585 }
1586
1587 false
1588 }
1589
1590 fn has_unescaped_brace(content: &str) -> bool {
1593 let bytes = content.as_bytes();
1594 let mut i = 0;
1595 while i < bytes.len() {
1596 if bytes[i] == b'{' {
1597 if i + 1 < bytes.len() && bytes[i + 1] == b'{' {
1598 i += 2;
1599 } else {
1600 return true;
1601 }
1602 } else {
1603 i += 1;
1604 }
1605 }
1606 false
1607 }
1608
1609 fn is_exponent_sign(current_word: &str, chars: &[char], char_idx: usize) -> bool {
1615 if !current_word.ends_with('e') && !current_word.ends_with('E') {
1617 return false;
1618 }
1619 let before_e = ¤t_word[..current_word.len() - 1];
1621 if before_e.is_empty() || !before_e.chars().next().unwrap().is_ascii_digit() {
1622 return false;
1623 }
1624 char_idx + 1 < chars.len() && chars[char_idx + 1].is_ascii_digit()
1626 }
1627
1628 fn dedent_triple_quote(raw: &str) -> String {
1631 let lines: Vec<&str> = raw.lines().collect();
1632 if lines.is_empty() {
1633 return String::new();
1634 }
1635 let min_indent = lines.iter()
1637 .filter(|l| !l.trim().is_empty())
1638 .map(|l| l.len() - l.trim_start().len())
1639 .min()
1640 .unwrap_or(0);
1641 lines.iter()
1643 .map(|l| {
1644 if l.len() >= min_indent {
1645 &l[min_indent..]
1646 } else {
1647 l.trim()
1648 }
1649 })
1650 .collect::<Vec<_>>()
1651 .join("\n")
1652 }
1653
1654 fn is_numeric_literal(word: &str) -> bool {
1655 if word.is_empty() {
1656 return false;
1657 }
1658 let chars: Vec<char> = word.chars().collect();
1659 let first = chars[0];
1660 if first.is_ascii_digit() {
1661 return true;
1663 }
1664 if let Some(underscore_pos) = word.rfind('_') {
1667 let before_underscore = &word[..underscore_pos];
1668 let after_underscore = &word[underscore_pos + 1..];
1669 let is_math_symbol = matches!(
1671 before_underscore.to_lowercase().as_str(),
1672 "aleph" | "omega" | "beth"
1673 );
1674 if is_math_symbol
1675 && !after_underscore.is_empty()
1676 && after_underscore.chars().all(|c| c.is_ascii_digit())
1677 {
1678 return true;
1679 }
1680 }
1681 false
1682 }
1683
1684 fn parse_duration_literal(word: &str) -> Option<(i64, &str)> {
1697 if word.is_empty() || !word.chars().next()?.is_ascii_digit() {
1698 return None;
1699 }
1700
1701 const SUFFIXES: &[(&str, i64)] = &[
1703 ("ns", 1),
1704 ("μs", 1_000),
1705 ("us", 1_000),
1706 ("ms", 1_000_000),
1707 ("sec", 1_000_000_000),
1708 ("s", 1_000_000_000),
1709 ("min", 60_000_000_000),
1710 ("hr", 3_600_000_000_000),
1711 ("h", 3_600_000_000_000),
1712 ];
1713
1714 for (suffix, multiplier) in SUFFIXES {
1716 if word.ends_with(suffix) {
1717 let num_part = &word[..word.len() - suffix.len()];
1718 let cleaned: String = num_part.chars().filter(|c| *c != '_').collect();
1720 if let Ok(n) = cleaned.parse::<i64>() {
1721 return Some((n.saturating_mul(*multiplier), *suffix));
1722 }
1723 }
1724 }
1725
1726 None
1727 }
1728
1729 fn parse_date_literal(word: &str) -> Option<i32> {
1734 if word.len() != 10 {
1736 return None;
1737 }
1738
1739 let bytes = word.as_bytes();
1740
1741 if bytes[4] != b'-' || bytes[7] != b'-' {
1743 return None;
1744 }
1745
1746 let year: i32 = word[0..4].parse().ok()?;
1748 let month: u32 = word[5..7].parse().ok()?;
1749 let day: u32 = word[8..10].parse().ok()?;
1750
1751 if month < 1 || month > 12 || day < 1 || day > 31 {
1753 return None;
1754 }
1755
1756 let y = if month <= 2 { year - 1 } else { year };
1759 let era = if y >= 0 { y / 400 } else { (y - 399) / 400 };
1760 let yoe = (y - era * 400) as u32;
1761 let m = month;
1762 let doy = (153 * (if m > 2 { m - 3 } else { m + 9 }) + 2) / 5 + day - 1;
1763 let doe = yoe * 365 + yoe / 4 - yoe / 100 + doy;
1764 let days = era * 146097 + doe as i32 - 719468;
1765
1766 Some(days)
1767 }
1768
1769 fn parse_time_literal(word: &str) -> Option<i64> {
1778 let lower = word.to_lowercase();
1779
1780 if lower == "noon" {
1782 return Some(12i64 * 3600 * 1_000_000_000);
1783 }
1784 if lower == "midnight" {
1785 return Some(0);
1786 }
1787
1788 let is_pm = lower.ends_with("pm");
1790 let is_am = lower.ends_with("am");
1791
1792 if !is_pm && !is_am {
1793 return None;
1794 }
1795
1796 let time_part = &lower[..lower.len() - 2];
1798
1799 let (hour, minute): (i64, i64) = if let Some(colon_idx) = time_part.find(':') {
1801 let hour_str = &time_part[..colon_idx];
1802 let min_str = &time_part[colon_idx + 1..];
1803 let h: i64 = hour_str.parse().ok()?;
1804 let m: i64 = min_str.parse().ok()?;
1805 (h, m)
1806 } else {
1807 let h: i64 = time_part.parse().ok()?;
1809 (h, 0)
1810 };
1811
1812 if hour < 1 || hour > 12 || minute < 0 || minute > 59 {
1814 return None;
1815 }
1816
1817 let hour_24 = if is_am {
1819 if hour == 12 { 0 } else { hour } } else {
1821 if hour == 12 { 12 } else { hour + 12 } };
1823
1824 let nanos = (hour_24 * 3600 + minute * 60) * 1_000_000_000;
1826 Some(nanos)
1827 }
1828
    /// Classifies `word` using multi-word lookahead.
    ///
    /// Handles everything that cannot be decided from the word alone: block
    /// headers ("##theorem"), fixed multi-word phrases ("each other",
    /// "if and only if", "at least N"), literal forms (number words,
    /// durations, dates, clock times, numerics), and the capitalized
    /// "A"/"An" article-vs-proper-name ambiguity. Falls through to
    /// `classify_word` for plain words. Check order matters throughout.
    fn classify_with_lookahead(&mut self, word: &str) -> TokenType {
        // "##name" starts a block and switches the lexer mode: Main/Function
        // bodies are imperative, everything else declarative.
        if word.starts_with("##") {
            let block_name = &word[2..];
            let block_type = match block_name.to_lowercase().as_str() {
                "theorem" => BlockType::Theorem,
                "main" => BlockType::Main,
                "definition" => BlockType::Definition,
                "proof" => BlockType::Proof,
                "example" => BlockType::Example,
                "logic" => BlockType::Logic,
                "note" => BlockType::Note,
                "to" => BlockType::Function,
                "a" | "an" => BlockType::TypeDef,
                "policy" => BlockType::Policy,
                "requires" => BlockType::Requires,
                "no" => BlockType::No,
                // Unknown headers degrade to a note rather than erroring.
                _ => BlockType::Note,
            };

            self.mode = match block_type {
                BlockType::Main | BlockType::Function => LexerMode::Imperative,
                _ => LexerMode::Declarative,
            };

            return TokenType::BlockHeader { block_type };
        }

        let lower = word.to_lowercase();

        // "each other" collapses into a single reciprocal pronoun token.
        if lower == "each" && self.peek_sequence(&["other"]) {
            self.consume_words(1);
            return TokenType::Reciprocal;
        }

        // "to" is the infinitive marker only when a verb follows; otherwise
        // it is an ordinary preposition.
        if lower == "to" {
            if let Some(next) = self.peek_word(1) {
                if self.is_verb_like(next) {
                    return TokenType::To;
                }
            }
            let sym = self.interner.intern("to");
            return TokenType::Preposition(sym);
        }

        // "at least N" / "at most N" become single quantifier tokens when N
        // is a recognized number word.
        if lower == "at" {
            if let Some(next) = self.peek_word(1) {
                let next_lower = next.to_lowercase();
                if next_lower == "least" {
                    if let Some(num_word) = self.peek_word(2) {
                        if let Some(n) = Self::word_to_number(num_word) {
                            self.consume_words(2);
                            return TokenType::AtLeast(n);
                        }
                    }
                }
                if next_lower == "most" {
                    if let Some(num_word) = self.peek_word(2) {
                        if let Some(n) = Self::word_to_number(num_word) {
                            self.consume_words(2);
                            return TokenType::AtMost(n);
                        }
                    }
                }
            }
        }

        // Literal forms, tried in a fixed order: number words first, then
        // durations, dates, clock times, and finally raw numerics.
        if let Some(n) = Self::word_to_number(&lower) {
            return TokenType::Cardinal(n);
        }

        if let Some((nanos, unit)) = Self::parse_duration_literal(word) {
            let unit_sym = self.interner.intern(unit);
            return TokenType::DurationLiteral {
                nanos,
                original_unit: unit_sym,
            };
        }

        if let Some(days) = Self::parse_date_literal(word) {
            return TokenType::DateLiteral { days };
        }

        if let Some(nanos_from_midnight) = Self::parse_time_literal(word) {
            return TokenType::TimeLiteral { nanos_from_midnight };
        }

        if Self::is_numeric_literal(word) {
            let sym = self.interner.intern(word);
            return TokenType::Number(sym);
        }

        // "if and only if" collapses to a single biconditional token.
        if lower == "if" && self.peek_sequence(&["and", "only", "if"]) {
            self.consume_words(3);
            return TokenType::Iff;
        }

        // "is equal to" / "is identical to" express identity, not copula.
        if lower == "is" {
            if self.peek_sequence(&["equal", "to"]) {
                self.consume_words(2);
                return TokenType::Identity;
            }
            if self.peek_sequence(&["identical", "to"]) {
                self.consume_words(2);
                return TokenType::Identity;
            }
        }

        // Capitalized "A"/"An": indefinite article or proper name? Decided
        // from the following one or two words.
        if (lower == "a" || lower == "an") && word.chars().next().unwrap().is_uppercase() {
            if let Some(next) = self.peek_word(1) {
                let next_lower = next.to_lowercase();
                let next_starts_lowercase = next.chars().next().map(|c| c.is_lowercase()).unwrap_or(false);

                // "A if …" / "A and …": a connective follows, so "A" must be
                // a proper name, not an article.
                if matches!(next_lower.as_str(), "if" | "and" | "or" | "implies" | "iff") {
                    let sym = self.interner.intern(word);
                    return TokenType::ProperName(sym);
                }

                // "A runs": a pure finite verb follows (not a gerund and not
                // also a noun/adjective), so "A" is a subject name.
                let is_verb = self.lexicon.lookup_verb(&next_lower).is_some()
                    && !lexicon::is_disambiguation_not_verb(&next_lower);
                let is_gerund = next_lower.ends_with("ing");
                let is_also_noun_or_adj = self.is_noun_like(&next_lower) || self.is_adjective_like(&next_lower);
                if is_verb && !is_gerund && !is_also_noun_or_adj {
                    let sym = self.interner.intern(word);
                    return TokenType::ProperName(sym);
                }

                // "A man is …": a copula/"has" two words ahead favors the
                // article reading.
                if let Some(third) = self.peek_word(2) {
                    let third_lower = third.to_lowercase();
                    if third_lower == "is" || third_lower == "are" || third_lower == "has" {
                        return TokenType::Article(Definiteness::Indefinite);
                    }
                }

                // A content word (noun/adjective) or any lowercase word
                // after "A" reads as an indefinite article.
                let is_content_word = self.is_noun_like(&next_lower) || self.is_adjective_like(&next_lower);
                if is_content_word || next_starts_lowercase {
                    return TokenType::Article(Definiteness::Indefinite);
                }
            }
            // No usable lookahead: default to the proper-name reading.
            let sym = self.interner.intern(word);
            return TokenType::ProperName(sym);
        }

        self.classify_word(word)
    }
1991
1992 fn is_noun_like(&self, word: &str) -> bool {
1993 if lexicon::is_noun_pattern(word) || lexicon::is_common_noun(word) {
1994 return true;
1995 }
1996 if word.ends_with("er") || word.ends_with("ian") || word.ends_with("ist") {
1997 return true;
1998 }
1999 false
2000 }
2001
2002 fn is_adjective_like(&self, word: &str) -> bool {
2003 lexicon::is_adjective(word) || lexicon::is_non_intersective(word)
2004 }
2005
    /// Single-word classification fallback for `classify_with_lookahead`.
    ///
    /// Checks run in a fixed priority order: demonstrative "that", symbolic
    /// operators, closed-class lexicon lookups (keyword/pronoun/article/
    /// auxiliary), mode-gated imperative keywords, prepositions, the main
    /// keyword table, adverbs, superlatives/comparatives, performatives,
    /// verbs, capitalized-word handling, verb/noun/adjective ambiguity, and
    /// last-resort noun heuristics. Anything left is an adjective.
    fn classify_word(&mut self, word: &str) -> TokenType {
        let lower = word.to_lowercase();
        let first_char = word.chars().next().unwrap();

        // "that man" reads as a distal article; bare "that" falls through
        // to the keyword lookup below.
        if lower == "that" {
            if let Some(next) = self.peek_word(1) {
                let next_lower = next.to_lowercase();
                if self.is_noun_like(&next_lower) || self.is_adjective_like(&next_lower) {
                    return TokenType::Article(Definiteness::Distal);
                }
            }
        }

        // Symbolic operators (matched on the raw word, not `lower`).
        if word == "->" {
            return TokenType::Arrow;
        }

        if word == "<=" {
            return TokenType::LtEq;
        }
        if word == ">=" {
            return TokenType::GtEq;
        }
        if word == "==" {
            return TokenType::EqEq;
        }
        if word == "!=" {
            return TokenType::NotEq;
        }
        if word == "<" {
            return TokenType::Lt;
        }
        if word == ">" {
            return TokenType::Gt;
        }
        if word == "=" {
            return TokenType::Assign;
        }

        // Closed-class lexicon lookups.
        if let Some(kind) = lexicon::lookup_keyword(&lower) {
            return kind;
        }

        if let Some(kind) = lexicon::lookup_pronoun(&lower) {
            return kind;
        }

        if let Some(def) = lexicon::lookup_article(&lower) {
            return TokenType::Article(def);
        }

        if let Some(time) = lexicon::lookup_auxiliary(&lower) {
            return TokenType::Auxiliary(time);
        }

        // Words whose reading depends on the lexer mode; checked before the
        // general preposition table so the imperative reading wins there.
        match lower.as_str() {
            "call" => return TokenType::Call,
            "in" if self.mode == LexerMode::Imperative => return TokenType::In,
            "inside" if self.mode == LexerMode::Imperative => return TokenType::Inside,
            "at" if self.mode == LexerMode::Imperative => return TokenType::At,
            "into" if self.mode == LexerMode::Imperative => return TokenType::Into,
            "before" => return TokenType::Before,
            _ => {}
        }

        if lexicon::is_preposition(&lower) {
            let sym = self.interner.intern(&lower);
            return TokenType::Preposition(sym);
        }

        // Main keyword table. Many arms are gated on imperative mode so the
        // same words can remain content words in declarative prose.
        match lower.as_str() {
            "equals" => return TokenType::Equals,
            "item" => return TokenType::Item,
            "items" => return TokenType::Items,
            "mut" if self.mode == LexerMode::Imperative => return TokenType::Mut,
            "let" => {
                // Remember the context so a later "be" can close it.
                self.in_let_context = true;
                return TokenType::Let;
            }
            "set" => {
                if self.peek_word(1).map_or(false, |w| w.to_lowercase() == "of") {
                    // "set of …" is the mathematical noun; fall through to
                    // the later classification stages.
                } else if self.mode == LexerMode::Imperative {
                    return TokenType::Set;
                } else {
                    // Declarative "set X … to …" still reads as assignment
                    // when a "to" appears within the next few words.
                    for offset in 2..=5 {
                        if self.peek_word(offset).map_or(false, |w| w.to_lowercase() == "to") {
                            return TokenType::Set;
                        }
                    }
                }
            }
            "return" => return TokenType::Return,
            "break" => return TokenType::Break,
            "xor" => return TokenType::Xor,
            "shifted" => return TokenType::Shifted,
            "be" if self.in_let_context => {
                // Closes the pending "let … be …" binding.
                self.in_let_context = false;
                return TokenType::Be;
            }
            "while" => return TokenType::While,
            "assert" => return TokenType::Assert,
            "trust" => return TokenType::Trust,
            "check" => return TokenType::Check,
            "given" if self.mode == LexerMode::Declarative => return TokenType::Given,
            "prove" if self.mode == LexerMode::Declarative => return TokenType::Prove,
            "auto" if self.mode == LexerMode::Declarative => return TokenType::Auto,
            "listen" if self.mode == LexerMode::Imperative => return TokenType::Listen,
            "connect" if self.mode == LexerMode::Imperative => return TokenType::NetConnect,
            "sleep" if self.mode == LexerMode::Imperative => return TokenType::Sleep,
            "sync" if self.mode == LexerMode::Imperative => return TokenType::Sync,
            "mount" if self.mode == LexerMode::Imperative => return TokenType::Mount,
            "persistent" => return TokenType::Persistent,
            "combined" if self.mode == LexerMode::Imperative => return TokenType::Combined,
            "launch" if self.mode == LexerMode::Imperative => return TokenType::Launch,
            "task" if self.mode == LexerMode::Imperative => return TokenType::Task,
            "pipe" if self.mode == LexerMode::Imperative => return TokenType::Pipe,
            "receive" if self.mode == LexerMode::Imperative => return TokenType::Receive,
            "stop" if self.mode == LexerMode::Imperative => return TokenType::Stop,
            "try" if self.mode == LexerMode::Imperative => return TokenType::Try,
            "into" if self.mode == LexerMode::Imperative => return TokenType::Into,
            "native" => return TokenType::Native,
            "escape" if self.mode == LexerMode::Imperative => return TokenType::Escape,
            "from" => return TokenType::From,
            "otherwise" => return TokenType::Otherwise,
            "else" => return TokenType::Else,
            "elif" => return TokenType::Elif,
            "either" if self.mode == LexerMode::Declarative => return TokenType::Either,
            "inspect" if self.mode == LexerMode::Imperative => return TokenType::Inspect,
            "new" if self.mode == LexerMode::Imperative => return TokenType::New,
            "give" if self.mode == LexerMode::Imperative => return TokenType::Give,
            "show" if self.mode == LexerMode::Imperative => return TokenType::Show,
            "push" if self.mode == LexerMode::Imperative => return TokenType::Push,
            "pop" if self.mode == LexerMode::Imperative => return TokenType::Pop,
            "copy" if self.mode == LexerMode::Imperative => return TokenType::Copy,
            "through" if self.mode == LexerMode::Imperative => return TokenType::Through,
            "length" if self.mode == LexerMode::Imperative => return TokenType::Length,
            "at" if self.mode == LexerMode::Imperative => return TokenType::At,
            "add" if self.mode == LexerMode::Imperative => return TokenType::Add,
            "remove" if self.mode == LexerMode::Imperative => return TokenType::Remove,
            "contains" if self.mode == LexerMode::Imperative => return TokenType::Contains,
            "union" if self.mode == LexerMode::Imperative => return TokenType::Union,
            "intersection" if self.mode == LexerMode::Imperative => return TokenType::Intersection,
            "inside" if self.mode == LexerMode::Imperative => return TokenType::Inside,
            "zone" if self.mode == LexerMode::Imperative => return TokenType::Zone,
            "called" if self.mode == LexerMode::Imperative => return TokenType::Called,
            "size" if self.mode == LexerMode::Imperative => return TokenType::Size,
            "mapped" if self.mode == LexerMode::Imperative => return TokenType::Mapped,
            "attempt" if self.mode == LexerMode::Imperative => return TokenType::Attempt,
            "following" if self.mode == LexerMode::Imperative => return TokenType::Following,
            "simultaneously" if self.mode == LexerMode::Imperative => return TokenType::Simultaneously,
            "read" if self.mode == LexerMode::Imperative => return TokenType::Read,
            "write" if self.mode == LexerMode::Imperative => return TokenType::Write,
            "console" if self.mode == LexerMode::Imperative => return TokenType::Console,
            "file" if self.mode == LexerMode::Imperative => return TokenType::File,
            "spawn" if self.mode == LexerMode::Imperative => return TokenType::Spawn,
            "send" if self.mode == LexerMode::Imperative => return TokenType::Send,
            "await" if self.mode == LexerMode::Imperative => return TokenType::Await,
            "portable" => return TokenType::Portable,
            "manifest" if self.mode == LexerMode::Imperative => return TokenType::Manifest,
            "chunk" if self.mode == LexerMode::Imperative => return TokenType::Chunk,
            "shared" => return TokenType::Shared,
            "merge" if self.mode == LexerMode::Imperative => return TokenType::Merge,
            "increase" if self.mode == LexerMode::Imperative => return TokenType::Increase,
            "decrease" if self.mode == LexerMode::Imperative => return TokenType::Decrease,
            "append" if self.mode == LexerMode::Imperative => return TokenType::Append,
            "resolve" if self.mode == LexerMode::Imperative => return TokenType::Resolve,
            "values" if self.mode == LexerMode::Imperative => return TokenType::Values,
            "tally" => return TokenType::Tally,
            "sharedset" => return TokenType::SharedSet,
            "sharedsequence" => return TokenType::SharedSequence,
            "collaborativesequence" => return TokenType::CollaborativeSequence,
            "sharedmap" => return TokenType::SharedMap,
            "divergent" => return TokenType::Divergent,
            "removewins" => return TokenType::RemoveWins,
            "addwins" => return TokenType::AddWins,
            "yata" => return TokenType::YATA,
            "day" | "days" => return TokenType::CalendarUnit(CalendarUnit::Day),
            "week" | "weeks" => return TokenType::CalendarUnit(CalendarUnit::Week),
            "month" | "months" => return TokenType::CalendarUnit(CalendarUnit::Month),
            "year" | "years" => return TokenType::CalendarUnit(CalendarUnit::Year),
            "ago" => return TokenType::Ago,
            "hence" => return TokenType::Hence,
            "if" => return TokenType::If,
            "only" => return TokenType::Focus(FocusKind::Only),
            "even" => return TokenType::Focus(FocusKind::Even),
            // "just" is a focus particle only when the next word is not a
            // verb, or is capitalized (e.g. "just John").
            "just" if self.peek_word(1).map_or(false, |w| {
                !self.is_verb_like(w) || w.to_lowercase() == "john" || w.chars().next().map_or(false, |c| c.is_uppercase())
            }) => return TokenType::Focus(FocusKind::Just),
            "much" => return TokenType::Measure(MeasureKind::Much),
            "little" => return TokenType::Measure(MeasureKind::Little),
            _ => {}
        }

        // Adverb classes, from most specific to most general.
        if lexicon::is_scopal_adverb(&lower) {
            let sym = self.interner.intern(&Self::capitalize(&lower));
            return TokenType::ScopalAdverb(sym);
        }

        if lexicon::is_temporal_adverb(&lower) {
            let sym = self.interner.intern(&Self::capitalize(&lower));
            return TokenType::TemporalAdverb(sym);
        }

        if lexicon::is_non_intersective(&lower) {
            let sym = self.interner.intern(&Self::capitalize(&lower));
            return TokenType::NonIntersectiveAdjective(sym);
        }

        if lexicon::is_adverb(&lower) {
            let sym = self.interner.intern(&Self::capitalize(&lower));
            return TokenType::Adverb(sym);
        }
        // Productive "-ly" adverbs, excluding the lexicon's not-adverb list
        // and very short words.
        if lower.ends_with("ly") && !lexicon::is_not_adverb(&lower) && lower.len() > 4 {
            let sym = self.interner.intern(&Self::capitalize(&lower));
            return TokenType::Adverb(sym);
        }

        if let Some(base) = self.try_parse_superlative(&lower) {
            let sym = self.interner.intern(&base);
            return TokenType::Superlative(sym);
        }

        // Irregular comparatives map to their capitalized base form.
        let irregular_comparative = match lower.as_str() {
            "less" => Some("Little"),
            "more" => Some("Much"),
            "better" => Some("Good"),
            "worse" => Some("Bad"),
            _ => None,
        };
        if let Some(base) = irregular_comparative {
            let sym = self.interner.intern(base);
            return TokenType::Comparative(sym);
        }

        if let Some(base) = self.try_parse_comparative(&lower) {
            let sym = self.interner.intern(&base);
            return TokenType::Comparative(sym);
        }

        if lexicon::is_performative(&lower) {
            let sym = self.interner.intern(&Self::capitalize(&lower));
            return TokenType::Performative(sym);
        }

        // Verbs the lexicon wants recognized before the heuristics below.
        if lexicon::is_base_verb_early(&lower) {
            let sym = self.interner.intern(&Self::capitalize(&lower));
            let class = lexicon::lookup_verb_class(&lower);
            return TokenType::Verb {
                lemma: sym,
                time: Time::Present,
                aspect: Aspect::Simple,
                class,
            };
        }

        // "-ing" forms that the verb lexicon knows.
        if lower.ends_with("ing") && lower.len() > 4 {
            if let Some(entry) = self.lexicon.lookup_verb(&lower) {
                let sym = self.interner.intern(&entry.lemma);
                return TokenType::Verb {
                    lemma: sym,
                    time: entry.time,
                    aspect: entry.aspect,
                    class: entry.class,
                };
            }
        }

        // Capitalized words default to proper names, except plural nouns in
        // subject position (e.g. "Lions are …" reads as a noun).
        if first_char.is_uppercase() {
            if let Some(next) = self.peek_word(1) {
                let next_lower = next.to_lowercase();
                let is_followed_by_verb = self.lexicon.lookup_verb(&next_lower).is_some()
                    || matches!(next_lower.as_str(), "is" | "are" | "was" | "were" | "has" | "have" | "had");

                if is_followed_by_verb {
                    if let Some(analysis) = lexicon::analyze_word(&lower) {
                        match analysis {
                            lexicon::WordAnalysis::Noun(meta) if meta.number == lexicon::Number::Plural => {
                                let sym = self.interner.intern(&lower);
                                return TokenType::Noun(sym);
                            }
                            lexicon::WordAnalysis::DerivedNoun { number: lexicon::Number::Plural, .. } => {
                                let sym = self.interner.intern(&lower);
                                return TokenType::Noun(sym);
                            }
                            _ => {
                            }
                        }
                    }
                }
            }

            let sym = self.interner.intern(word);
            return TokenType::ProperName(sym);
        }

        // Words that are both a verb and a noun/adjective (and not pinned by
        // the disambiguation list) are emitted as Ambiguous for the parser.
        let verb_entry = self.lexicon.lookup_verb(&lower);
        let is_noun = lexicon::is_common_noun(&lower);
        let is_adj = self.is_adjective_like(&lower);
        let is_disambiguated = lexicon::is_disambiguation_not_verb(&lower);

        if verb_entry.is_some() && (is_noun || is_adj) && !is_disambiguated {
            let entry = verb_entry.unwrap();
            let verb_token = TokenType::Verb {
                lemma: self.interner.intern(&entry.lemma),
                time: entry.time,
                aspect: entry.aspect,
                class: entry.class,
            };

            let mut alternatives = Vec::new();
            if is_noun {
                alternatives.push(TokenType::Noun(self.interner.intern(word)));
            }
            if is_adj {
                alternatives.push(TokenType::Adjective(self.interner.intern(word)));
            }

            return TokenType::Ambiguous {
                primary: Box::new(verb_token),
                alternatives,
            };
        }

        // The disambiguation list pins verb-lookalikes to noun/adjective.
        if let Some(_) = &verb_entry {
            if is_disambiguated {
                let sym = self.interner.intern(word);
                if is_noun {
                    return TokenType::Noun(sym);
                }
                return TokenType::Adjective(sym);
            }
        }

        if let Some(entry) = verb_entry {
            let sym = self.interner.intern(&entry.lemma);
            return TokenType::Verb {
                lemma: sym,
                time: entry.time,
                aspect: entry.aspect,
                class: entry.class,
            };
        }

        if is_noun {
            let sym = self.interner.intern(word);
            return TokenType::Noun(sym);
        }

        if lexicon::is_base_verb(&lower) {
            let sym = self.interner.intern(&Self::capitalize(&lower));
            let class = lexicon::lookup_verb_class(&lower);
            return TokenType::Verb {
                lemma: sym,
                time: Time::Present,
                aspect: Aspect::Simple,
                class,
            };
        }

        // Last-resort noun heuristics: derivational suffixes plus a short
        // hard-coded list of common words.
        if lower.ends_with("ian")
            || lower.ends_with("er")
            || lower == "logic"
            || lower == "time"
            || lower == "men"
            || lower == "book"
            || lower == "house"
            || lower == "code"
            || lower == "user"
        {
            let sym = self.interner.intern(word);
            return TokenType::Noun(sym);
        }

        if lexicon::is_particle(&lower) {
            let sym = self.interner.intern(&lower);
            return TokenType::Particle(sym);
        }

        // Default: treat anything still unclassified as an adjective.
        let sym = self.interner.intern(word);
        TokenType::Adjective(sym)
    }
2449
2450 fn capitalize(s: &str) -> String {
2451 let mut chars = s.chars();
2452 match chars.next() {
2453 None => String::new(),
2454 Some(first) => first.to_uppercase().collect::<String>() + chars.as_str(),
2455 }
2456 }
2457
2458 pub fn is_collective_verb(lemma: &str) -> bool {
2459 lexicon::is_collective_verb(&lemma.to_lowercase())
2460 }
2461
2462 pub fn is_mixed_verb(lemma: &str) -> bool {
2463 lexicon::is_mixed_verb(&lemma.to_lowercase())
2464 }
2465
2466 pub fn is_distributive_verb(lemma: &str) -> bool {
2467 lexicon::is_distributive_verb(&lemma.to_lowercase())
2468 }
2469
2470 pub fn is_intensional_predicate(lemma: &str) -> bool {
2471 lexicon::is_intensional_predicate(&lemma.to_lowercase())
2472 }
2473
2474 pub fn is_opaque_verb(lemma: &str) -> bool {
2475 lexicon::is_opaque_verb(&lemma.to_lowercase())
2476 }
2477
2478 pub fn is_ditransitive_verb(lemma: &str) -> bool {
2479 lexicon::is_ditransitive_verb(&lemma.to_lowercase())
2480 }
2481
2482 fn is_verb_like(&self, word: &str) -> bool {
2483 let lower = word.to_lowercase();
2484 if lexicon::is_infinitive_verb(&lower) {
2485 return true;
2486 }
2487 if let Some(entry) = self.lexicon.lookup_verb(&lower) {
2488 return entry.lemma.len() > 0;
2489 }
2490 false
2491 }
2492
2493 pub fn is_subject_control_verb(lemma: &str) -> bool {
2494 lexicon::is_subject_control_verb(&lemma.to_lowercase())
2495 }
2496
2497 pub fn is_raising_verb(lemma: &str) -> bool {
2498 lexicon::is_raising_verb(&lemma.to_lowercase())
2499 }
2500
2501 pub fn is_object_control_verb(lemma: &str) -> bool {
2502 lexicon::is_object_control_verb(&lemma.to_lowercase())
2503 }
2504
2505 pub fn is_weather_verb(lemma: &str) -> bool {
2506 matches!(
2507 lemma.to_lowercase().as_str(),
2508 "rain" | "snow" | "hail" | "thunder" | "pour"
2509 )
2510 }
2511
2512 fn try_parse_superlative(&self, word: &str) -> Option<String> {
2513 if !word.ends_with("est") || word.len() < 5 {
2514 return None;
2515 }
2516
2517 let base = &word[..word.len() - 3];
2518
2519 if base.len() >= 2 {
2520 let chars: Vec<char> = base.chars().collect();
2521 let last = chars[chars.len() - 1];
2522 let second_last = chars[chars.len() - 2];
2523 if last == second_last && !"aeiou".contains(last) {
2524 let stem = &base[..base.len() - 1];
2525 if lexicon::is_gradable_adjective(stem) {
2526 return Some(Self::capitalize(stem));
2527 }
2528 }
2529 }
2530
2531 if base.ends_with("i") {
2532 let stem = format!("{}y", &base[..base.len() - 1]);
2533 if lexicon::is_gradable_adjective(&stem) {
2534 return Some(Self::capitalize(&stem));
2535 }
2536 }
2537
2538 if lexicon::is_gradable_adjective(base) {
2539 return Some(Self::capitalize(base));
2540 }
2541
2542 None
2543 }
2544
2545 fn try_parse_comparative(&self, word: &str) -> Option<String> {
2546 if !word.ends_with("er") || word.len() < 4 {
2547 return None;
2548 }
2549
2550 let base = &word[..word.len() - 2];
2551
2552 if base.len() >= 2 {
2553 let chars: Vec<char> = base.chars().collect();
2554 let last = chars[chars.len() - 1];
2555 let second_last = chars[chars.len() - 2];
2556 if last == second_last && !"aeiou".contains(last) {
2557 let stem = &base[..base.len() - 1];
2558 if lexicon::is_gradable_adjective(stem) {
2559 return Some(Self::capitalize(stem));
2560 }
2561 }
2562 }
2563
2564 if base.ends_with("i") {
2565 let stem = format!("{}y", &base[..base.len() - 1]);
2566 if lexicon::is_gradable_adjective(&stem) {
2567 return Some(Self::capitalize(&stem));
2568 }
2569 }
2570
2571 if lexicon::is_gradable_adjective(base) {
2572 return Some(Self::capitalize(base));
2573 }
2574
2575 None
2576 }
2577}
2578
2579#[cfg(test)]
2580mod tests {
2581 use super::*;
2582
    // Smoke test: a contraction ("it's") must not break tokenization.
    #[test]
    fn lexer_handles_apostrophe() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("it's raining", &mut interner);
        let tokens = lexer.tokenize();
        assert!(!tokens.is_empty());
    }

    // Smoke test: sentence-final punctuation must not break tokenization.
    #[test]
    fn lexer_handles_question_mark() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("Is it raining?", &mut interner);
        let tokens = lexer.tokenize();
        assert!(!tokens.is_empty());
    }

    // "ring" exists in the verb lexicon but must classify as a noun here.
    #[test]
    fn ring_is_not_verb() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("ring", &mut interner);
        let tokens = lexer.tokenize();
        assert!(matches!(tokens[0].kind, TokenType::Noun(_)));
    }

    // Relative "that" (after a noun, before a verb) must stay
    // TokenType::That rather than become a distal article. The eprintln
    // output is intentional: it aids diagnosis when the assert fails.
    #[test]
    fn debug_that_token() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("The cat that runs", &mut interner);
        let tokens = lexer.tokenize();
        for (i, t) in tokens.iter().enumerate() {
            let lex = interner.resolve(t.lexeme);
            eprintln!("Token[{}]: {:?} -> {:?}", i, lex, t.kind);
        }
        let that_token = tokens.iter().find(|t| interner.resolve(t.lexeme) == "that");
        if let Some(t) = that_token {
            let check = std::mem::discriminant(&t.kind) == std::mem::discriminant(&TokenType::That);
            eprintln!("Discriminant check for That: {}", check);
            assert!(matches!(t.kind, TokenType::That), "'that' should be TokenType::That, got {:?}", t.kind);
        } else {
            panic!("No 'that' token found");
        }
    }

    // "bus" must classify as a noun, not a verb form.
    #[test]
    fn bus_is_not_verb() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("bus", &mut interner);
        let tokens = lexer.tokenize();
        assert!(matches!(tokens[0].kind, TokenType::Noun(_)));
    }

    // Lowercase "a" before a noun is the indefinite article, and the
    // following word classifies as a noun.
    #[test]
    fn lowercase_a_is_article() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("a car", &mut interner);
        let tokens = lexer.tokenize();
        for (i, t) in tokens.iter().enumerate() {
            let lex = interner.resolve(t.lexeme);
            eprintln!("Token[{}]: {:?} -> {:?}", i, lex, t.kind);
        }
        assert_eq!(tokens[0].kind, TokenType::Article(Definiteness::Indefinite));
        assert!(matches!(tokens[1].kind, TokenType::Noun(_)), "Expected Noun, got {:?}", tokens[1].kind);
    }

    // "open" is verb/adjective ambiguous: the lexer must emit an Ambiguous
    // token with the verb reading primary and an adjective alternative.
    #[test]
    fn open_is_ambiguous() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("open", &mut interner);
        let tokens = lexer.tokenize();

        if let TokenType::Ambiguous { primary, alternatives } = &tokens[0].kind {
            assert!(matches!(**primary, TokenType::Verb { .. }), "Primary should be Verb");
            assert!(alternatives.iter().any(|t| matches!(t, TokenType::Adjective(_))),
                "Should have Adjective alternative");
        } else {
            panic!("Expected Ambiguous token for 'open', got {:?}", tokens[0].kind);
        }
    }

    // End-to-end: quantifier, plural noun, and copula classify as expected.
    #[test]
    fn basic_tokenization() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("All men are mortal.", &mut interner);
        let tokens = lexer.tokenize();
        assert_eq!(tokens[0].kind, TokenType::All);
        assert!(matches!(tokens[1].kind, TokenType::Noun(_)));
        assert_eq!(tokens[2].kind, TokenType::Are);
    }
2672
2673 #[test]
2674 fn iff_tokenizes_as_single_token() {
2675 let mut interner = Interner::new();
2676 let mut lexer = Lexer::new("A if and only if B", &mut interner);
2677 let tokens = lexer.tokenize();
2678 assert!(
2679 tokens.iter().any(|t| t.kind == TokenType::Iff),
2680 "should contain Iff token: got {:?}",
2681 tokens
2682 );
2683 }
2684
2685 #[test]
2686 fn is_equal_to_tokenizes_as_identity() {
2687 let mut interner = Interner::new();
2688 let mut lexer = Lexer::new("Socrates is equal to Socrates", &mut interner);
2689 let tokens = lexer.tokenize();
2690 assert!(
2691 tokens.iter().any(|t| t.kind == TokenType::Identity),
2692 "should contain Identity token: got {:?}",
2693 tokens
2694 );
2695 }
2696
2697 #[test]
2698 fn is_identical_to_tokenizes_as_identity() {
2699 let mut interner = Interner::new();
2700 let mut lexer = Lexer::new("Clark is identical to Superman", &mut interner);
2701 let tokens = lexer.tokenize();
2702 assert!(
2703 tokens.iter().any(|t| t.kind == TokenType::Identity),
2704 "should contain Identity token: got {:?}",
2705 tokens
2706 );
2707 }
2708
2709 #[test]
2710 fn itself_tokenizes_as_reflexive() {
2711 let mut interner = Interner::new();
2712 let mut lexer = Lexer::new("John loves itself", &mut interner);
2713 let tokens = lexer.tokenize();
2714 assert!(
2715 tokens.iter().any(|t| t.kind == TokenType::Reflexive),
2716 "should contain Reflexive token: got {:?}",
2717 tokens
2718 );
2719 }
2720
2721 #[test]
2722 fn himself_tokenizes_as_reflexive() {
2723 let mut interner = Interner::new();
2724 let mut lexer = Lexer::new("John sees himself", &mut interner);
2725 let tokens = lexer.tokenize();
2726 assert!(
2727 tokens.iter().any(|t| t.kind == TokenType::Reflexive),
2728 "should contain Reflexive token: got {:?}",
2729 tokens
2730 );
2731 }
2732
2733 #[test]
2734 fn to_stay_tokenizes_correctly() {
2735 let mut interner = Interner::new();
2736 let mut lexer = Lexer::new("to stay", &mut interner);
2737 let tokens = lexer.tokenize();
2738 assert!(
2739 tokens.iter().any(|t| t.kind == TokenType::To),
2740 "should contain To token: got {:?}",
2741 tokens
2742 );
2743 assert!(
2744 tokens.iter().any(|t| matches!(t.kind, TokenType::Verb { .. })),
2745 "should contain Verb token for stay: got {:?}",
2746 tokens
2747 );
2748 }
2749
2750 #[test]
2751 fn possessive_apostrophe_s() {
2752 let mut interner = Interner::new();
2753 let mut lexer = Lexer::new("John's dog", &mut interner);
2754 let tokens = lexer.tokenize();
2755 assert!(
2756 tokens.iter().any(|t| t.kind == TokenType::Possessive),
2757 "should contain Possessive token: got {:?}",
2758 tokens
2759 );
2760 assert!(
2761 tokens.iter().any(|t| matches!(&t.kind, TokenType::ProperName(_))),
2762 "should have John as proper name: got {:?}",
2763 tokens
2764 );
2765 }
2766
2767 #[test]
2768 fn lexer_produces_valid_spans() {
2769 let input = "All men are mortal.";
2770 let mut interner = Interner::new();
2771 let mut lexer = Lexer::new(input, &mut interner);
2772 let tokens = lexer.tokenize();
2773
2774 assert_eq!(tokens[0].span.start, 0);
2776 assert_eq!(tokens[0].span.end, 3);
2777 assert_eq!(&input[tokens[0].span.start..tokens[0].span.end], "All");
2778
2779 assert_eq!(tokens[1].span.start, 4);
2781 assert_eq!(tokens[1].span.end, 7);
2782 assert_eq!(&input[tokens[1].span.start..tokens[1].span.end], "men");
2783
2784 assert_eq!(tokens[2].span.start, 8);
2786 assert_eq!(tokens[2].span.end, 11);
2787 assert_eq!(&input[tokens[2].span.start..tokens[2].span.end], "are");
2788
2789 assert_eq!(tokens[3].span.start, 12);
2791 assert_eq!(tokens[3].span.end, 18);
2792 assert_eq!(&input[tokens[3].span.start..tokens[3].span.end], "mortal");
2793
2794 assert_eq!(tokens[4].span.start, 18);
2796 assert_eq!(tokens[4].span.end, 19);
2797
2798 assert_eq!(tokens[5].span.start, input.len());
2800 assert_eq!(tokens[5].kind, TokenType::EOF);
2801 }
2802
2803 #[test]
2804 fn triple_quote_produces_string_token() {
2805 let mut interner = Interner::new();
2806 let source = "## Main\nLet msg be \"\"\"\n Hello\n World\n\"\"\".\nShow msg.";
2807 let mut lexer = Lexer::new(source, &mut interner);
2808 let tokens = lexer.tokenize();
2809 for (i, t) in tokens.iter().enumerate() {
2811 let lex = interner.resolve(t.lexeme);
2812 eprintln!("Token[{}]: {:?} lex={:?} span={}..{}", i, t.kind, lex, t.span.start, t.span.end);
2813 }
2814 let str_token = tokens.iter().find(|t| matches!(t.kind, TokenType::StringLiteral(_) | TokenType::InterpolatedString(_)));
2816 assert!(str_token.is_some(), "Should have a string token. Tokens: {:?}", tokens.iter().map(|t| format!("{:?}", t.kind)).collect::<Vec<_>>());
2817 if let Some(tok) = str_token {
2818 let content = interner.resolve(tok.lexeme);
2819 eprintln!("Triple-quote content: {:?}", content);
2820 assert!(content.contains("Hello"), "Should contain Hello, got: {:?}", content);
2821 }
2822 }
2823}