//! Lexer for the Sigil programming language.
//!
//! Handles polysynthetic morphemes, evidentiality markers, and multi-base numerals.

use crate::span::Span;
use logos::Logos;

/// Process escape sequences in a string literal.
/// Converts \n, \t, \r, \\, \", \', \0, \xNN, \u{NNNN} to their actual characters.
/// Also handles line continuation: \<newline><whitespace> is stripped entirely.
fn process_escape_sequences(s: &str) -> String {
    let mut result = String::with_capacity(s.len());
    let mut chars = s.chars().peekable();

    while let Some(c) = chars.next() {
        if c == '\\' {
            match chars.next() {
                Some('\n') => {
                    // Line continuation: skip newline and any leading whitespace
                    while let Some(&c) = chars.peek() {
                        if c == ' ' || c == '\t' {
                            chars.next();
                        } else {
                            break;
                        }
                    }
                }
                Some('n') => result.push('\n'),
                Some('t') => result.push('\t'),
                Some('r') => result.push('\r'),
                Some('\\') => result.push('\\'),
                Some('"') => result.push('"'),
                Some('\'') => result.push('\''),
                Some('0') => result.push('\0'),
                Some('x') => {
                    // \xNN - two hex digits
                    let mut hex = String::new();
                    for _ in 0..2 {
                        if let Some(&c) = chars.peek() {
                            if c.is_ascii_hexdigit() {
                                hex.push(chars.next().unwrap());
                            }
                        }
                    }
                    if let Ok(val) = u8::from_str_radix(&hex, 16) {
                        result.push(val as char);
                    }
                }
                Some('u') => {
                    // \u{NNNN} - Unicode code point
                    if chars.peek() == Some(&'{') {
                        chars.next(); // consume '{'
                        let mut hex = String::new();
                        while let Some(&c) = chars.peek() {
                            if c == '}' {
                                chars.next();
                                break;
                            }
                            if c.is_ascii_hexdigit() {
                                hex.push(chars.next().unwrap());
                            } else {
                                break;
                            }
                        }
                        if let Ok(val) = u32::from_str_radix(&hex, 16) {
                            if let Some(c) = char::from_u32(val) {
                                result.push(c);
                            }
                        }
                    }
                }
                Some(other) => {
                    // Unknown escape, keep as-is
                    result.push('\\');
                    result.push(other);
                }
                None => result.push('\\'),
            }
        } else {
            result.push(c);
        }
    }
    result
}
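
// A minimal illustration of the rules above (hypothetical inputs, not taken
// from any Sigil source):
//
//     assert_eq!(process_escape_sequences(r"\x41\u{3C4}"), "Aτ");
//     assert_eq!(process_escape_sequences("a\\\n    b"), "ab"); // line continuation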

/// Process escape sequences in byte string literals, returning bytes.
fn process_byte_escape_sequences(s: &str) -> Vec<u8> {
    let mut result = Vec::with_capacity(s.len());
    let mut chars = s.chars().peekable();

    while let Some(c) = chars.next() {
        if c == '\\' {
            match chars.next() {
                Some('\n') => {
                    // Line continuation: skip newline and any leading whitespace
                    while let Some(&c) = chars.peek() {
                        if c == ' ' || c == '\t' {
                            chars.next();
                        } else {
                            break;
                        }
                    }
                }
                Some('n') => result.push(b'\n'),
                Some('t') => result.push(b'\t'),
                Some('r') => result.push(b'\r'),
                Some('\\') => result.push(b'\\'),
                Some('"') => result.push(b'"'),
                Some('\'') => result.push(b'\''),
                Some('0') => result.push(0),
                Some('x') => {
                    // \xNN - two hex digits
                    let mut hex = String::new();
                    for _ in 0..2 {
                        if let Some(&c) = chars.peek() {
                            if c.is_ascii_hexdigit() {
                                hex.push(chars.next().unwrap());
                            }
                        }
                    }
                    if let Ok(val) = u8::from_str_radix(&hex, 16) {
                        result.push(val);
                    }
                }
                Some(other) => {
                    // Unknown escape, keep as-is
                    result.push(b'\\');
                    if other.is_ascii() {
                        result.push(other as u8);
                    }
                }
                None => result.push(b'\\'),
            }
        } else if c.is_ascii() {
            result.push(c as u8);
        }
        // Non-ASCII in byte strings is ignored (Rust doesn't allow it)
    }
    result
}
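
// Byte-level counterpart of the example above (hypothetical input): hex
// escapes become raw bytes, so e.g.
//
//     assert_eq!(process_byte_escape_sequences(r"\x00\xff!"), vec![0x00, 0xFF, b'!']);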

/// Callback for block comments: /* ... */
/// Consumes characters until the closing */ is found.
fn block_comment_callback(lex: &mut logos::Lexer<'_, Token>) -> Option<String> {
    let remainder = lex.remainder();

    // Find the closing */
    if let Some(end_pos) = remainder.find("*/") {
        let content = &remainder[..end_pos];
        // Bump past content and closing */ (2 chars)
        lex.bump(end_pos + 2);
        Some(content.to_string())
    } else {
        // No closing */ found - consume rest as comment
        let len = remainder.len();
        lex.bump(len);
        Some(remainder.to_string())
    }
}
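
// For instance (hypothetical input), lexing "/* hi */ x" matches the "/*"
// token, the callback finds "*/" at offset 4 in the remainder " hi */ x",
// bumps 6 bytes, and yields BlockComment(" hi ") followed by Ident("x").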

/// Callback for delimited raw strings (r#"..."#).
/// Reads until the closing "# is found.
fn raw_string_delimited_callback(lex: &mut logos::Lexer<'_, Token>) -> Option<String> {
    let remainder = lex.remainder();

    // Find the closing "#
    if let Some(end_pos) = remainder.find("\"#") {
        let content = &remainder[..end_pos];
        // Bump past content and closing "# (2 chars)
        lex.bump(end_pos + 2);
        Some(content.to_string())
    } else {
        None
    }
}
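
// Example (hypothetical input): lexing r#"He said "hi""# yields
// RawStringDelimited with content `He said "hi"` - the embedded quote does
// not terminate the literal because it is not followed by '#'.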

/// Callback for multi-line string literals.
/// Reads from """ until the next """ is found.
fn multiline_string_callback(lex: &mut logos::Lexer<'_, Token>) -> Option<String> {
    let remainder = lex.remainder();

    // Find the closing """
    if let Some(end_pos) = remainder.find("\"\"\"") {
        let content = &remainder[..end_pos];
        // Bump the lexer past the content and closing quotes
        lex.bump(end_pos + 3);
        Some(process_escape_sequences(content))
    } else {
        // No closing """ found - return None so logos reports a lexing error
        None
    }
}
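
// Example (hypothetical input): """a\nb""" yields MultiLineStringLit("a\nb")
// with the escape already processed; an unterminated """ opener surfaces as
// a lexing error instead.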

/// Process escape sequences in a character literal.
fn process_char_escape(s: &str) -> char {
    let mut chars = s.chars();
    match chars.next() {
        Some('\\') => match chars.next() {
            Some('n') => '\n',
            Some('t') => '\t',
            Some('r') => '\r',
            Some('\\') => '\\',
            Some('"') => '"',
            Some('\'') => '\'',
            Some('0') => '\0',
            Some('x') => {
                let hex: String = chars.take(2).collect();
                u8::from_str_radix(&hex, 16)
                    .map(|v| v as char)
                    .unwrap_or('?')
            }
            Some('u') => {
                if chars.next() == Some('{') {
                    let hex: String = chars.take_while(|&c| c != '}').collect();
                    u32::from_str_radix(&hex, 16)
                        .ok()
                        .and_then(char::from_u32)
                        .unwrap_or('?')
                } else {
                    '?'
                }
            }
            Some(c) => c,
            None => '?',
        },
        Some(c) => c,
        None => '?',
    }
}
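
// Example (hypothetical inputs): process_char_escape("a") == 'a',
// process_char_escape(r"\u{263A}") == '☺', and malformed escapes fall back
// to '?' rather than panicking.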

/// Process escape sequences in a byte character literal (b'x').
fn process_byte_char_escape(s: &str) -> u8 {
    let mut chars = s.chars();
    match chars.next() {
        Some('\\') => match chars.next() {
            Some('n') => b'\n',
            Some('t') => b'\t',
            Some('r') => b'\r',
            Some('\\') => b'\\',
            Some('"') => b'"',
            Some('\'') => b'\'',
            Some('0') => b'\0',
            Some('x') => {
                let hex: String = chars.take(2).collect();
                u8::from_str_radix(&hex, 16).unwrap_or(b'?')
            }
            Some(c) => c as u8,
            None => b'?',
        },
        Some(c) => c as u8,
        None => b'?',
    }
}

/// Token types for Sigil.
#[derive(Logos, Debug, Clone, PartialEq)]
#[logos(skip r"[ \t\r\n\f]+")]
pub enum Token {
    // === Evidential Doc Comments (SGDOC) ===
    // These must come before LineComment to match first
    // Format: //<marker> <content> or //<marker><marker> <content> for inner docs
    // Inner docs (//!!, //~~, etc.) have priority 10 to match before outer docs

    /// Verified doc comment: //! content (backed by test)
    /// Inner variant: //!! documents the containing module
    #[regex(r"//!![^\n]*", priority = 10, callback = |lex| lex.slice()[4..].trim().to_string())]
    DocCommentVerifiedInner(String),
    #[regex(r"//![^!\n][^\n]*", priority = 8, callback = |lex| lex.slice()[3..].trim().to_string())]
    DocCommentVerified(String),

    /// Reported doc comment: //~ content (from spec/author, default)
    /// Inner variant: //~~ documents the containing module
    #[regex(r"//~~[^\n]*", priority = 10, callback = |lex| lex.slice()[4..].trim().to_string())]
    DocCommentReportedInner(String),
    #[regex(r"//~[^~\n][^\n]*", priority = 8, callback = |lex| lex.slice()[3..].trim().to_string())]
    DocCommentReported(String),

    /// Uncertain doc comment: //? content (needs investigation)
    /// Inner variant: //?? documents the containing module
    #[regex(r"//\?\?[^\n]*", priority = 10, callback = |lex| lex.slice()[4..].trim().to_string())]
    DocCommentUncertainInner(String),
    #[regex(r"//\?[^\?\n][^\n]*", priority = 8, callback = |lex| lex.slice()[3..].trim().to_string())]
    DocCommentUncertain(String),

    /// Predicted doc comment: //◊ content (planned feature)
    /// Inner variant: //◊◊ documents the containing module
    #[regex(r"//◊◊[^\n]*", priority = 10, callback = |lex| {
        // ◊ is multi-byte UTF-8, use trim_start_matches
        lex.slice().trim_start_matches("//◊◊").trim().to_string()
    })]
    DocCommentPredictedInner(String),
    #[regex(r"//◊[^◊\n][^\n]*", priority = 8, callback = |lex| {
        lex.slice().trim_start_matches("//◊").trim().to_string()
    })]
    DocCommentPredicted(String),

    /// Paradox doc comment: //‽ content (known inconsistency)
    /// Inner variant: //‽‽ documents the containing module
    #[regex(r"//‽‽[^\n]*", priority = 10, callback = |lex| {
        // ‽ is multi-byte UTF-8, use trim_start_matches
        lex.slice().trim_start_matches("//‽‽").trim().to_string()
    })]
    DocCommentParadoxInner(String),
    #[regex(r"//‽[^‽\n][^\n]*", priority = 8, callback = |lex| {
        lex.slice().trim_start_matches("//‽").trim().to_string()
    })]
    DocCommentParadox(String),
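
    // Example (hypothetical input): `//! checked by tests` lexes as
    // DocCommentVerified("checked by tests"), while `//!! module docs`
    // lexes as DocCommentVerifiedInner("module docs").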

    // === Regular Comments ===
    // Legacy doc comment (maps to Reported for backwards compat)
    // Must have priority > LineComment to match /// before //
    #[regex(r"///[^\n]*", priority = 5, callback = |lex| lex.slice()[3..].trim().to_string())]
    DocComment(String),

    // Standard line comment (must be lower priority than doc comments)
    #[regex(r"//[^\n]*", priority = 1, callback = |lex| lex.slice().to_string())]
    LineComment(String),

    // Tilde comment style: ~~ ... ~~
    #[regex(r"~~[^\n]*", |lex| lex.slice().to_string())]
    TildeComment(String),

    // Block comment: /* ... */ (non-nested)
    #[token("/*", block_comment_callback)]
    BlockComment(String),

    // === Deprecated Rust Syntax ===
    // These tokens capture Rust-like syntax for helpful error messages
    // The parser maps these to Sigil equivalents in error messages
    #[token("fn", |lex| lex.slice().to_string())]
    #[token("let", |lex| lex.slice().to_string())]
    #[token("mut", |lex| lex.slice().to_string())]
    #[token("struct", |lex| lex.slice().to_string())]
    #[token("enum", |lex| lex.slice().to_string())]
    #[token("trait", |lex| lex.slice().to_string())]
    #[token("impl", |lex| lex.slice().to_string())]
    #[token("mod", |lex| lex.slice().to_string())]
    #[token("use", |lex| lex.slice().to_string())]
    #[token("pub", |lex| lex.slice().to_string())]
    #[token("if", |lex| lex.slice().to_string())]
    #[token("else", |lex| lex.slice().to_string())]
    #[token("match", |lex| lex.slice().to_string())]
    #[token("while", |lex| lex.slice().to_string())]
    #[token("for", |lex| lex.slice().to_string())]
    #[token("in", |lex| lex.slice().to_string())]
    #[token("break", |lex| lex.slice().to_string())]
    #[token("continue", |lex| lex.slice().to_string())]
    #[token("return", |lex| lex.slice().to_string())]
    DeprecatedRustKeyword(String),

    // Rust mutable reference &mut - use &Δ in Sigil
    #[token("&mut")]
    DeprecatedAmpMut,

    // === Keywords (Native Sigil Syntax Only) ===
    #[token("rite")]  // Rite - named function declaration (canonical Sigil keyword)
    Fn,
    #[token("async")]
    #[token("⌛")]  // Hourglass - time/waiting (native symbol alternative)
    Async,
    #[token("≔")]  // Definition assignment
    Let,
    #[token("Δ")]  // Delta - change/mutable
    #[token("vary")]  // Vary - mutable/changing (Sigil prose alternative)
    Mut,
    #[token("const")]
    #[token("◆")]  // Diamond - solid, fixed (native symbol alternative)
    Const,
    #[token("linear")]
    Linear,
    #[token("affine")]
    Affine,
    #[token("relevant")]
    Relevant,
    #[token("type")]
    Type,
    #[token("sigil")]
    #[token("Σ")]
    Struct,
    #[token("ᛈ")]  // Perthro rune - lot cup, choices/fate
    Enum,
    #[token("Θ")]  // Theta - theory, aspect
    #[token("aspect")]  // Aspect - trait/interface (Sigil prose alternative)
    Trait,
    #[token("⊢")]  // Turnstile - proves/provides
    Impl,
    #[token("scroll")]
    Mod,
    #[token("invoke")]
    Use,
    #[token("☉")]  // Sun - visible, public
    Pub,
    #[token("actor")]
    Actor,
    #[token("saga")]
    Saga,
    #[token("scope")]
    Scope,
    #[token("rune")]
    Rune,
    #[token("macro")]
    Macro,
    #[token("macro_rules")]
    MacroRules,

    // Control flow (Native Sigil Syntax Only)
    // Note: ∀ (ForAll token) is used contextually as `for` by parser
    // Note: ∈ (ElementOf token) is used contextually as `in` by parser
    // Note: ⊗ (Tensor token) is used contextually as `break` by parser
    // Note: ↻ (CycleArrow token) is used contextually as `continue` by parser
    // Note: ∞ (Infinity token) is used contextually as `loop` by parser
    #[token("⎇")]  // ISO branch symbol
    If,
    #[token("⎉")]  // ISO alternative symbol
    Else,
    #[token("⌥")]  // Option key - choices
    Match,
    #[token("loop")]
    #[token("forever")]  // Prose alternative for loop
    Loop, // Legacy - parser also handles ∞ (Infinity token) for loop
    #[token("⟳")]  // Cycle arrow
    While,
    // For - parser uses ForAll (∀) token contextually
    // In - parser uses ElementOf (∈) token contextually
    // Break - parser uses Tensor (⊗) token contextually
    // Continue - parser uses CycleArrow (↻) token contextually
    #[token("⤺")]  // Return arrow
    Return,
    #[token("yield")]
    Yield,
    #[token("await")]
    Await,

    // Other keywords
    #[token("self")]
    #[token("this")]  // This - current instance (Sigil prose alternative)
    SelfLower,
    // Note: ⊙ kept for Hadamard product - self uses keyword only
    #[token("Self")]
    #[token("This")]  // This - current type (Sigil prose alternative)
    SelfUpper,
    #[token("super")]
    Super,
    #[token("tome")]
    Crate,
    #[token("where")]
    #[token("∋")]  // Contains as member - native where clause
    Where,
    #[token("as")]
    As,
    #[token("dyn")]
    Dyn,
    #[token("move")]
    Move,
    #[token("ref")]
    Ref,
    #[token("static")]
    Static,
    #[token("unsafe")]
    Unsafe,
    #[token("extern")]
    Extern,
    #[token("asm")]
    Asm,
    #[token("volatile")]
    Volatile,
    #[token("naked")]
    Naked,
    #[token("packed")]
    Packed,
    #[token("simd")]
    Simd,
    #[token("atomic")]
    Atomic,
    #[token("derive")]
    Derive,
    #[token("on")]
    On,

    // Plurality keywords (DAEMONIORUM extensions)
    #[token("alter")]
    Alter,
    #[token("switch")]
    Switch,
    #[token("headspace")]
    Headspace,
    #[token("cocon")]
    CoCon,
    #[token("reality")]
    Reality,
    #[token("split")]
    Split,
    #[token("trigger")]
    Trigger,
    #[token("layer")]
    Layer,
    #[token("location")]
    Location,
    #[token("states")]
    States,
    #[token("anima")]
    Anima,
    #[token("to")]
    To,
    #[token("from")]
    From,

    // Alter-source markers (compound tokens)
    #[token("@!")]
    AlterSourceFronting,
    #[token("@~")]
    AlterSourceCoCon,
    #[token("@?")]
    AlterSourceDormant,
    #[token("@‽")]
    AlterSourceBlended,

    // Boolean literals
    #[token("true")]
    #[token("yay")]
    #[token("yea")]
    True,
    #[token("false")]
    #[token("nay")]
    False,

    // Null literal
    #[token("null")]
    Null,

    // === Morphemes (Greek letters) ===
    #[token("τ")]
    #[token("Τ")]
    Tau, // Transform/map

    #[token("φ")]
    #[token("Φ")]
    Phi, // Filter

    #[token("σ")]
    Sigma, // Sort morpheme (lowercase only - uppercase Σ is struct keyword)

    #[token("ρ")]
    #[token("Ρ")]
    Rho, // Reduce

    #[token("Λ")]
    Lambda, // Lambda morpheme (uppercase Λ only - used in pipe morpheme contexts)

    #[token("λ")]
    LambdaExpr, // Lambda closure expression: λ(params) [→ RetType] { body }

    #[token("Π")]
    Pi, // Product

    // Note: ⌛ (hourglass) is now mapped to Async keyword

    // Additional morphemes
    #[token("δ")]
    Delta, // Difference/change morpheme (lowercase only - uppercase Δ is mut keyword)

    #[token("ε")]
    Epsilon, // Empty/null

    #[token("ω")]
    #[token("Ω")]
    Omega, // End/terminal

    #[token("α")]
    Alpha, // First element

    #[token("ζ")]
    Zeta, // Zip/combine

    // === Additional Access Morphemes ===
    #[token("μ")]
    #[token("Μ")]
    Mu, // Middle/median element

    #[token("χ")]
    #[token("Χ")]
    Chi, // Random/choice (from chaos)

    #[token("ν")]
    #[token("Ν")]
    Nu, // Nth element (ordinal)

    #[token("ξ")]
    #[token("Ξ")]
    Xi, // Next in sequence

    #[token("ψ")]
    #[token("Ψ")]
    Psi, // Psychological/mental state

    #[token("θ")]
    Theta, // Threshold/angle morpheme (lowercase only - uppercase Θ is trait keyword)

    #[token("κ")]
    #[token("Κ")]
    Kappa, // Callback/continuation

    // === Parallel/Concurrency Morphemes ===
    #[token("∥")]
    #[token("parallel")]
    Parallel, // Parallel execution (U+2225)

    #[token("gpu")]
    Gpu, // GPU compute shader

    #[token("⊛")]
    Convolve, // Convolution/merge operator (U+229B - circled asterisk)

    // === Quantifiers (for AI-native set operations) ===
    #[token("∀")]
    #[token("each")]  // Prose alternative for impl blocks: ⊢ Trait each Type
    ForAll, // Universal quantification (parser handles contextual use as `for` keyword)

    #[token("∃")]
    Exists, // Existential quantification

    #[token("∈")]
    #[token("of")]   // Prose alternative: each x of iter { }
    ElementOf, // Membership test (parser handles contextual use as `in` keyword)

    #[token("∉")]
    NotElementOf, // Non-membership

    // === Set Operations ===
    #[token("∪")]
    Union, // Set union

    #[token("∩")]
    Intersection, // Set intersection

    #[token("∖")]
    SetMinus, // Set difference

    #[token("⊂")]
    Subset, // Proper subset

    #[token("⊆")]
    SubsetEq, // Subset or equal

    #[token("⊃")]
    Superset, // Proper superset

    #[token("⊇")]
    SupersetEq, // Superset or equal

    // === Logic Operators ===
    #[token("∧")]
    LogicAnd, // Logical conjunction

    #[token("∨")]
    LogicOr, // Logical disjunction

    #[token("¬")]
    LogicNot, // Logical negation

    #[token("⊻")]
    LogicXor, // Exclusive or

    #[token("⊤")]
    Top, // True/any type

    #[token("⊥")]
    Bottom, // False/never type

    // === Bitwise Operators (Unicode) ===
    #[token("⋏")]
    BitwiseAndSymbol, // Bitwise AND (U+22CF)

    #[token("⋎")]
    BitwiseOrSymbol, // Bitwise OR (U+22CE)

    #[token("⊙")]
    CircledDot, // Hadamard product / element-wise multiply (U+2299)

    // Note: ⊗ (tensor product) is already defined as Token::Tensor below

    // === Type Theory ===
    #[token("∷")]
    TypeAnnotation, // Type annotation (alternative to :)

    // === Analysis/Calculus ===
    #[token("∫")]
    Integral, // Cumulative sum

    #[token("∂")]
    Partial, // Discrete derivative

    #[token("√")]
    Sqrt, // Square root

    #[token("∛")]
    Cbrt, // Cube root

    #[token("∇")]
    Nabla, // Gradient (U+2207)

    // === APL-Inspired Symbols ===
    #[token("⍋")]
    GradeUp, // Sort ascending (U+234B)

    #[token("⍒")]
    GradeDown, // Sort descending (U+2352)

    #[token("⌽")]
    Rotate, // Reverse/rotate (U+233D)

    #[token("↻")]
    #[token("⊳")]  // Right triangle - prose alternative for continue
    CycleArrow, // Cycle/repeat (U+21BB)

    #[token("⌺")]
    QuadDiamond, // Windows/stencil (U+233A)

    #[token("⊞")]
    SquaredPlus, // Chunks (U+229E)

    #[token("⍳")]
    Iota, // Enumerate/index (U+2373)

    // === Category Theory ===
    #[token("∘")]
    Compose, // Function composition

    #[token("⊗")]
    #[token("⊲")]  // Left triangle - prose alternative for break
    Tensor, // Tensor product (parser also handles as break keyword)

    #[token("⊕")]
    DirectSum, // Direct sum / XOR

    // === Data Operations ===
    #[token("⋈")]
    Bowtie, // Join/zip combining (U+22C8)

    #[token("⋳")]
    ElementSmallVerticalBar, // Flatten (U+22F3)

    #[token("⊔")]
    SquareCup, // Lattice join / supremum (U+2294)

    #[token("⊓")]
    SquareCap, // Lattice meet / infimum (U+2293)

    // === Evidentiality Markers ===
    // Note: These are handled contextually since ! and ? have other uses
    #[token("‽")]
    Interrobang, // Paradox/trust boundary (U+203D)

    #[token("◊")]
    Lozenge, // Predicted/speculative (U+25CA) - Token◊

    #[token("□")]
    BoxSquare, // Necessity/verification (U+25A1) - |□verify

    // === Legion Morphemes (Holographic Agent Collective) ===
    // From Infernum 2.0 - distributed memory and multi-agent coordination

    #[token("∿")]
    #[token("legion_field")]
    LegionField, // Collective memory substrate (U+223F sine wave) - memory∿

    #[token("⫰")]
    #[token("interfere")]
    Interfere, // Interference query (U+2AF0) - query ⫰ field∿

    #[token("⟁")]
    #[token("distribute")]
    Distribute, // Holographic distribution (U+27C1) - task ⟁ 8

    #[token("⟀")]
    #[token("gather")]
    Gather, // Interference gathering (U+27C0) - fragments ⟀

    #[token("↠")]
    #[token("broadcast")]
    Broadcast, // One-to-many broadcast (U+21A0) - signal ↠ legion

    #[token("⇢")]
    #[token("consensus")]
    Consensus, // Many-to-one consensus (U+21E2) - contributions ⇢

    // Compound Legion operators
    #[token("⊕=")]
    DirectSumEq, // Superposition assign - field∿ ⊕= pattern

    #[token("∂=")]
    PartialEq_, // Decay assign - field∿ ∂= 0.95 (renamed to avoid std conflict)

    #[token("⫰=")]
    InterfereEq, // Interference assign

    // === Affective Markers (Sentiment & Emotion) ===
    // Sentiment polarity
    #[token("⊖")]
    AffectNegative, // Negative sentiment (U+2296 Circled Minus)

    #[token("⊜")]
    AffectNeutral, // Neutral sentiment (U+229C Circled Equals)

    // Note: ⊕ (U+2295) is already DirectSum - we'll use it dual-purpose for positive sentiment

    // Sarcasm/Irony
    #[token("⸮")]
    IronyMark, // Irony/sarcasm marker (U+2E2E - historical percontation point!)

    // Intensity modifiers
    #[token("↑")]
    IntensityUp, // Intensifier (U+2191)

    #[token("↓")]
    IntensityDown, // Dampener (U+2193)

    #[token("⇈")]
    IntensityMax, // Maximum intensity (U+21C8)

    // Formality register
    #[token("♔")]
    FormalRegister, // Formal (U+2654 White King)

    #[token("♟")]
    InformalRegister, // Informal (U+265F Black Pawn)

    // Emotion markers (Plutchik's wheel)
    #[token("☺")]
    EmotionJoy, // Joy (U+263A)

    #[token("☹")]
    EmotionSadness, // Sadness (U+2639)

    #[token("⚡")]
    EmotionAnger, // Anger (U+26A1)

    #[token("❄")]
    EmotionFear, // Fear (U+2744)

    #[token("✦")]
    EmotionSurprise, // Surprise (U+2726)

    #[token("♡")]
    EmotionLove, // Love/Trust (U+2661)

    // Confidence markers
    #[token("◉")]
    ConfidenceHigh, // High confidence (U+25C9)

    #[token("◎")]
    ConfidenceMedium, // Medium confidence (U+25CE)

    #[token("○")]
    ConfidenceLow, // Low confidence (U+25CB)

    // === Aspect Morphemes (verb aspects) ===
    #[token("·ing")]
    AspectProgressive, // Ongoing/streaming aspect

    #[token("·ed")]
    AspectPerfective, // Completed aspect

    #[token("·able")]
    AspectPotential, // Capability aspect

    #[token("·ive")]
    AspectResultative, // Result-producing aspect
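
    // Example (hypothetical input): `parse·ing` lexes as Ident("parse")
    // followed by AspectProgressive - logos prefers the longer `·ing` match
    // over a bare MiddleDot.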

    // === Operators ===
    #[token("|")]
    Pipe,
    #[token("·")]
    MiddleDot, // Incorporation
    #[token("->")]
    #[token("→")]  // Unicode arrow (U+2192) - native Sigil syntax
    Arrow,
    #[token("=>")]
    FatArrow,
    #[token("<-")]
    LeftArrow,
    #[token("==")]
    EqEq,
    #[token("!=")]
    NotEq,
    #[token("<=")]
    LtEq,
    #[token(">=")]
    GtEq,
    #[token("<")]
    Lt,
    #[token(">")]
    Gt,
    #[token("+")]
    Plus,
    #[token("-")]
    Minus,
    #[token("*")]
    Star,
    #[token("/")]
    Slash,
    #[token("%")]
    Percent,
    #[token("**")]
    StarStar, // Exponentiation
    #[token("&&")]
    AndAnd,
    #[token("||")]
    OrOr,
    #[token("!")]
    Bang, // Evidentiality: known / logical not
    #[token("?")]
    Question, // Evidentiality: uncertain / try
    #[token("~")]
    Tilde, // Evidentiality: reported
    #[token("&")]
    Amp,
    #[token("^")]
    Caret,
    #[token("<<=")]
    ShlEq,
    #[token(">>=")]
    ShrEq,
    #[token("<<")]
    Shl,
    #[token(">>")]
    Shr,
    #[token("=")]
    Eq,
    #[token("+=")]
    PlusEq,
    #[token("-=")]
    MinusEq,
    #[token("*=")]
    StarEq,
    #[token("/=")]
    SlashEq,
    #[token("%=")]
    PercentEq,
    #[token("|=")]
    PipeEq,
    #[token("&=")]
    AmpEq,
    #[token("^=")]
    CaretEq,
    #[token("..")]
    DotDot,
    #[token("..=")]
    DotDotEq,
    #[token("++")]
    PlusPlus, // Concatenation
    // Deprecated Rust operator - use · (middledot) for paths
    #[token("::")]
    DeprecatedColonColon,
    #[token(":")]
    Colon,
    #[token(";")]
    Semi,
    #[token(",")]
    Comma,
    #[token(".")]
    Dot,
    #[token("@")]
    At,
    #[token("$")]
    Dollar, // For macro parameters like $n, $__pipe
    #[token("#!")]
    HashBang, // Inner attribute prefix #![...]
    #[token("#")]
    Hash,
    #[token("_", priority = 3)]
    Underscore,

    // === Delimiters ===
    #[token("(")]
    LParen,
    #[token(")")]
    RParen,
    #[token("{")]
    LBrace,
    #[token("}")]
    RBrace,
    #[token("[")]
    LBracket,
    #[token("]")]
    RBracket,

    // === Special symbols ===
    #[token("∅")]
    Empty, // Void/emptiness (śūnya)
    #[token("◯")]
    Circle, // Geometric zero
    #[token("∞")]
    Infinity, // Infinite value (parser handles contextual use for loop keyword)

    // === Protocol Operations (Sigil-native networking) ===
    #[token("⇒")]
    ProtoSend, // Send data (U+21D2 - rightwards double arrow)

    #[token("⇐")]
    ProtoRecv, // Receive data (U+21D0 - leftwards double arrow)

    #[token("≋")]
    ProtoStream, // Stream data (U+224B - triple tilde)

    #[token("⊸")]
    ProtoConnect, // Connect/lollipop (U+22B8 - multimap)

    #[token("⏱")]
    ProtoTimeout, // Timeout (U+23F1 - stopwatch)

    // Note: ⊗ (Tensor) is used for close in protocol contexts

    // Protocol keywords for ASCII fallback
    #[token("send")]
    Send,
    #[token("recv")]
    Recv,
    #[token("stream")]
    Stream,
    #[token("connect")]
    Connect,
    #[token("close")]
    Close,
    #[token("timeout")]
    Timeout,
    #[token("retry")]
    Retry,
    #[token("header")]
    Header,
    #[token("body")]
    Body,

    // Protocol type identifiers (for incorporation: http·, ws·, grpc·, kafka·)
    #[token("http")]
    Http,
    #[token("https")]
    Https,
    #[token("ws")]
    Ws,
    #[token("wss")]
    Wss,
    #[token("grpc")]
    Grpc,
    #[token("kafka")]
    Kafka,
    #[token("amqp")]
    Amqp,
    #[token("graphql")]
    GraphQL,

    // === Numbers ===
    // Binary: 0b... with optional type suffix
    #[regex(r"0b[01_]+(i8|i16|i32|i64|i128|isize|u8|u16|u32|u64|u128|usize)?", |lex| lex.slice().to_string())]
    BinaryLit(String),

    // Octal: 0o... with optional type suffix
    #[regex(r"0o[0-7_]+(i8|i16|i32|i64|i128|isize|u8|u16|u32|u64|u128|usize)?", |lex| lex.slice().to_string())]
    OctalLit(String),

    // Hex: 0x... with optional type suffix
    #[regex(r"0x[0-9a-fA-F_]+(i8|i16|i32|i64|i128|isize|u8|u16|u32|u64|u128|usize)?", |lex| lex.slice().to_string())]
    HexLit(String),

    // Vigesimal: 0v... (base 20)
    #[regex(r"0v[0-9a-jA-J_]+", |lex| lex.slice().to_string())]
    VigesimalLit(String),

    // Sexagesimal: 0s... (base 60)
    #[regex(r"0s[0-9a-zA-Z_]+", |lex| lex.slice().to_string())]
    SexagesimalLit(String),

    // Duodecimal: 0z... (base 12)
    #[regex(r"0z[0-9a-bA-B_]+", |lex| lex.slice().to_string())]
    DuodecimalLit(String),

    // Float: 123.456 or 1.23e10 or 1e-15 (with or without decimal point if exponent present)
    // Optional type suffix: f16, f32, f64, f128 (with optional underscore separator, e.g. 2.0_f64)
    #[regex(r"([0-9][0-9_]*\.[0-9][0-9_]*([eE][+-]?[0-9_]+)?|[0-9][0-9_]*[eE][+-]?[0-9_]+)_?(f16|f32|f64|f128)?", |lex| lex.slice().to_string())]
    FloatLit(String),

    // Integer: 123 with optional type suffix (i8, i16, i32, i64, i128, isize, u8, u16, u32, u64, u128, usize)
    // Optional underscore separator before suffix (e.g. 42_i32)
    #[regex(r"[0-9][0-9_]*_?(i8|i16|i32|i64|i128|isize|u8|u16|u32|u64|u128|usize)?", |lex| lex.slice().to_string())]
    IntLit(String),
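
    // Example (hypothetical inputs): `42_i32` lexes as a single
    // IntLit("42_i32") and `2.0_f64` as a single FloatLit("2.0_f64"); the
    // suffix stays in the slice for later stages to interpret.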

    // === Strings ===
    // Regular string with escape sequence processing
    // Note: \\(.|\n) handles both regular escapes and line continuation (\ at end of line)
    #[regex(r#""([^"\\]|\\(.|\n))*""#, |lex| {
        let s = lex.slice();
        let inner = &s[1..s.len()-1];
        process_escape_sequences(inner)
    })]
    StringLit(String),

    // Multi-line string (triple-quoted) - handled via callback
    #[token(r#"""""#, multiline_string_callback)]
    MultiLineStringLit(String),

    // Byte string literal
    // Note: \\(.|\n) also matches line continuation, which
    // process_byte_escape_sequences already handles
    #[regex(r#"b"([^"\\]|\\(.|\n))*""#, |lex| {
        let s = lex.slice();
        let inner = &s[2..s.len()-1];
        process_byte_escape_sequences(inner)
    })]
    ByteStringLit(Vec<u8>),

    // Interpolated string (will be parsed further for expressions)
    #[regex(r#"f"([^"\\]|\\.)*""#, |lex| {
        let s = lex.slice();
        let inner = &s[2..s.len()-1];
        process_escape_sequences(inner)
    })]
    InterpolatedStringLit(String),

    // Sigil string - SQL template (σ prefix)
    #[regex(r#"σ"([^"\\]|\\.)*""#, |lex| {
        let s = lex.slice();
        // Get byte index after the σ character (which is 2 bytes in UTF-8)
        let start = "σ".len() + 1; // σ + opening quote
        let inner = &s[start..s.len()-1];
        process_escape_sequences(inner)
    })]
    SigilStringSql(String),

    // Sigil string - Route template (ρ prefix)
    #[regex(r#"ρ"([^"\\]|\\.)*""#, |lex| {
        let s = lex.slice();
        // Get byte index after the ρ character (which is 2 bytes in UTF-8)
        let start = "ρ".len() + 1; // ρ + opening quote
        let inner = &s[start..s.len()-1];
        process_escape_sequences(inner)
    })]
    SigilStringRoute(String),

    // Char literal with escape sequence processing
    // Matches: single char, hex escape \xNN, unicode escape \u{N...}, or simple escape \c
    #[regex(r"'([^'\\]|\\x[0-9a-fA-F]{2}|\\u\{[0-9a-fA-F]{1,6}\}|\\.)'", |lex| {
        let s = lex.slice();
        let inner = &s[1..s.len()-1];
        process_char_escape(inner)
    })]
    CharLit(char),

    // Byte char literal (b'x' or b'\n')
    #[regex(r"b'([^'\\]|\\x[0-9a-fA-F]{2}|\\.)'", |lex| {
        let s = lex.slice();
        // Extract the character between b' and '
        let inner = &s[2..s.len()-1];
        process_byte_char_escape(inner)
    })]
    ByteCharLit(u8),

    // Raw string (no escape processing, but allows \" for literal quotes in patterns)
    #[regex(r#"r"([^"\\]|\\.)*""#, |lex| {
        let s = lex.slice();
        s[2..s.len()-1].to_string()
    })]
    RawStringLit(String),

    // Raw string with delimiter (r#"..."# style) - handles internal quotes
    #[token(r##"r#""##, raw_string_delimited_callback)]
    RawStringDelimited(String),

    // === Lifetime/Label (for loop labels like 'outer: loop { break 'outer }) ===
    #[regex(r"'[a-zA-Z_][a-zA-Z0-9_]*", |lex| lex.slice()[1..].to_string())]
    Lifetime(String),

    // === Identifiers ===
    // Includes Greek letters for polysynthetic identifiers like compute_ψ_state
    // Greek letters (both cases): αΑ, βΒ, γΓ, δΔ, εΕ, ζΖ, ηΗ, θΘ, ιΙ, κΚ, λΛ, μΜ, νΝ, ξΞ, οΟ, πΠ, ρΡ, σΣ, τΤ, υΥ, φΦ, χΧ, ψΨ, ωΩ
    #[regex(r"[a-zA-Z_αΑβΒγΓδΔεΕζΖηΗθΘιΙκΚλΛμΜνΝξΞοΟπΠρΡσΣτΤυΥφΦχΧψΨωΩ][a-zA-Z0-9_αΑβΒγΓδΔεΕζΖηΗθΘιΙκΚλΛμΜνΝξΞοΟπΠρΡσΣτΤυΥφΦχΧψΨωΩ]*", |lex| lex.slice().to_string())]
    Ident(String),

    // === Rune annotation ===
    #[regex(r"//@\s*rune:\s*[a-zA-Z_][a-zA-Z0-9_]*", |lex| lex.slice().to_string())]
    RuneAnnotation(String),
}

impl Token {
    pub fn is_keyword(&self) -> bool {
        matches!(
            self,
            Token::Fn
                | Token::Async
                | Token::Let
                | Token::Mut
                | Token::Const
                | Token::Linear
                | Token::Affine
                | Token::Relevant
                | Token::Type
                | Token::Struct
                | Token::Enum
                | Token::Trait
                | Token::Impl
                | Token::Mod
                | Token::Use
                | Token::Pub
                | Token::Actor
                | Token::Saga
                | Token::Scope
                | Token::Rune
                | Token::If
                | Token::Else
                | Token::Match
                | Token::Loop
                | Token::While
                | Token::ForAll      // ∀ - used as for keyword
                | Token::ElementOf   // ∈ - used as in keyword
                | Token::Tensor      // ⊗ - used as break keyword
                | Token::CycleArrow  // ↻ - used as continue keyword
                | Token::Return
                | Token::Yield
                | Token::Await
        ) || self.is_plurality_keyword()
    }

    pub fn is_plurality_keyword(&self) -> bool {
        matches!(
            self,
            Token::Alter
                | Token::Switch
                | Token::Headspace
                | Token::CoCon
                | Token::Reality
                | Token::Split
                | Token::Trigger
                | Token::Layer
                | Token::Location
                | Token::States
                | Token::Anima
                | Token::To
                | Token::From
        )
    }

    pub fn is_alter_source(&self) -> bool {
        matches!(
            self,
            Token::AlterSourceFronting
                | Token::AlterSourceCoCon
                | Token::AlterSourceDormant
                | Token::AlterSourceBlended
        )
    }

    pub fn is_morpheme(&self) -> bool {
        matches!(
            self,
            Token::Tau | Token::Phi | Token::Sigma | Token::Rho |
            Token::Lambda | Token::Pi | Token::Async |
            Token::Delta | Token::Epsilon | Token::Omega | Token::Alpha | Token::Zeta |
            Token::Mu | Token::Chi | Token::Nu | Token::Xi |  // Access morphemes
            Token::Parallel | Token::Gpu |  // Concurrency morphemes
            Token::Integral | Token::Partial | Token::Sqrt | Token::Cbrt |
            Token::Compose
        )
    }

    pub fn is_aspect(&self) -> bool {
        matches!(
            self,
            Token::AspectProgressive
                | Token::AspectPerfective
                | Token::AspectPotential
                | Token::AspectResultative
        )
    }

    pub fn is_data_op(&self) -> bool {
        matches!(
            self,
            Token::Bowtie | Token::ElementSmallVerticalBar | Token::SquareCup | Token::SquareCap
        )
    }

    pub fn is_bitwise_symbol(&self) -> bool {
        matches!(self, Token::BitwiseAndSymbol | Token::BitwiseOrSymbol)
    }

    pub fn is_quantifier(&self) -> bool {
        matches!(
            self,
            Token::ForAll | Token::Exists | Token::ElementOf | Token::NotElementOf
        )
    }

    pub fn is_set_op(&self) -> bool {
        matches!(
            self,
            Token::Union
                | Token::Intersection
                | Token::SetMinus
                | Token::Subset
                | Token::SubsetEq
                | Token::Superset
                | Token::SupersetEq
        )
    }

    pub fn is_logic_op(&self) -> bool {
        matches!(
            self,
            Token::LogicAnd
                | Token::LogicOr
                | Token::LogicNot
                | Token::LogicXor
                | Token::Top
                | Token::Bottom
        )
    }

    pub fn is_evidentiality(&self) -> bool {
        matches!(
            self,
            Token::Bang | Token::Question | Token::Tilde | Token::Interrobang | Token::Lozenge
        )
    }

    pub fn is_legion_morpheme(&self) -> bool {
        matches!(
            self,
            Token::LegionField      // ∿ - collective memory
                | Token::DirectSum  // ⊕ - superposition
                | Token::Interfere  // ⫰ - interference
                | Token::ConfidenceHigh  // ◉ - resonance (dual-purpose)
                | Token::Distribute // ⟁ - holographic distribution
                | Token::Gather     // ⟀ - interference gathering
                | Token::Broadcast  // ↠ - one-to-many
                | Token::Consensus  // ⇢ - many-to-one
                | Token::Partial    // ∂ - decay
        )
    }

    pub fn is_legion_assign(&self) -> bool {
        matches!(
            self,
            Token::DirectSumEq | Token::PartialEq_ | Token::InterfereEq
        )
    }

    pub fn is_affective(&self) -> bool {
        matches!(
            self,
            // Sentiment
            Token::DirectSum |  // ⊕ positive (dual-purpose with DirectSum)
            Token::AffectNegative |  // ⊖ negative
            Token::AffectNeutral |  // ⊜ neutral
            // Sarcasm
            Token::IronyMark |  // ⸮ irony/sarcasm
            // Intensity
            Token::IntensityUp |  // ↑
            Token::IntensityDown |  // ↓
            Token::IntensityMax |  // ⇈
            // Formality
            Token::FormalRegister |  // ♔
            Token::InformalRegister |  // ♟
            // Emotions
            Token::EmotionJoy |  // ☺
            Token::EmotionSadness |  // ☹
            Token::EmotionAnger |  // ⚡
            Token::EmotionFear |  // ❄
            Token::EmotionSurprise |  // ✦
            Token::EmotionLove |  // ♡
            // Confidence
            Token::ConfidenceHigh |  // ◉
            Token::ConfidenceMedium |  // ◎
            Token::ConfidenceLow // ○
        )
    }

    pub fn is_sentiment(&self) -> bool {
        matches!(
            self,
            Token::DirectSum | Token::AffectNegative | Token::AffectNeutral
        )
    }

    pub fn is_emotion(&self) -> bool {
        matches!(
            self,
            Token::EmotionJoy
                | Token::EmotionSadness
                | Token::EmotionAnger
                | Token::EmotionFear
                | Token::EmotionSurprise
                | Token::EmotionLove
        )
    }

    pub fn is_intensity(&self) -> bool {
        matches!(
            self,
            Token::IntensityUp | Token::IntensityDown | Token::IntensityMax
        )
    }

    /// Returns true if this token is any kind of doc comment (evidential or legacy)
    pub fn is_doc_comment(&self) -> bool {
        matches!(
            self,
            Token::DocCommentVerified(_)
                | Token::DocCommentVerifiedInner(_)
                | Token::DocCommentReported(_)
                | Token::DocCommentReportedInner(_)
                | Token::DocCommentUncertain(_)
                | Token::DocCommentUncertainInner(_)
                | Token::DocCommentPredicted(_)
                | Token::DocCommentPredictedInner(_)
                | Token::DocCommentParadox(_)
                | Token::DocCommentParadoxInner(_)
                | Token::DocComment(_)  // Legacy /// comments map to Reported
        )
    }

    /// Returns true if this is an inner doc comment (documents the enclosing item)
    pub fn is_inner_doc_comment(&self) -> bool {
        matches!(
            self,
            Token::DocCommentVerifiedInner(_)
                | Token::DocCommentReportedInner(_)
                | Token::DocCommentUncertainInner(_)
                | Token::DocCommentPredictedInner(_)
                | Token::DocCommentParadoxInner(_)
        )
    }

    /// Returns the evidentiality marker for a doc comment (!, ~, ?, ◊, ‽)
    /// Returns '~' (Reported) for legacy /// comments as default
    pub fn doc_comment_evidentiality(&self) -> Option<char> {
        match self {
            Token::DocCommentVerified(_) | Token::DocCommentVerifiedInner(_) => Some('!'),
            Token::DocCommentReported(_) | Token::DocCommentReportedInner(_) => Some('~'),
            Token::DocCommentUncertain(_) | Token::DocCommentUncertainInner(_) => Some('?'),
            Token::DocCommentPredicted(_) | Token::DocCommentPredictedInner(_) => Some('◊'),
            Token::DocCommentParadox(_) | Token::DocCommentParadoxInner(_) => Some('‽'),
            Token::DocComment(_) => Some('~'),  // Legacy defaults to Reported
            _ => None,
        }
    }

    /// Extracts the content string from a doc comment token
    pub fn doc_comment_content(&self) -> Option<&str> {
        match self {
            Token::DocCommentVerified(s)
            | Token::DocCommentVerifiedInner(s)
            | Token::DocCommentReported(s)
            | Token::DocCommentReportedInner(s)
            | Token::DocCommentUncertain(s)
            | Token::DocCommentUncertainInner(s)
            | Token::DocCommentPredicted(s)
            | Token::DocCommentPredictedInner(s)
            | Token::DocCommentParadox(s)
            | Token::DocCommentParadoxInner(s)
            | Token::DocComment(s) => Some(s.as_str()),
            _ => None,
        }
    }
}

/// Lexer wrapping Logos for Sigil.
pub struct Lexer<'a> {
    inner: logos::Lexer<'a, Token>,
    /// Buffer for lookahead tokens (supports multi-token peek)
    buffer: Vec<Option<(Token, Span)>>,
}

impl<'a> Lexer<'a> {
    pub fn new(source: &'a str) -> Self {
        Self {
            inner: Token::lexer(source),
            buffer: Vec::new(),
        }
    }

    /// Read the next token from the underlying logos lexer
    fn read_next(&mut self) -> Option<(Token, Span)> {
        loop {
            match self.inner.next() {
                Some(Ok(token)) => {
                    let span = self.inner.span();
                    return Some((token, Span::new(span.start, span.end)));
                }
                // Skip invalid tokens and try the next one; iterating avoids
                // unbounded recursion on long runs of invalid input
                Some(Err(_)) => continue,
                None => return None,
            }
        }
    }

    pub fn next_token(&mut self) -> Option<(Token, Span)> {
        if !self.buffer.is_empty() {
            // Return from buffer (front = next token)
            // Each buffer element is Option<(Token, Span)> where None = EOF
            return self.buffer.remove(0);
        }
        self.read_next()
    }

    pub fn peek(&mut self) -> Option<&(Token, Span)> {
        self.peek_n(0)
    }

    /// Peek n tokens ahead (0 = next token, 1 = token after that, etc.)
    pub fn peek_n(&mut self, n: usize) -> Option<&(Token, Span)> {
        // Fill buffer up to position n
        while self.buffer.len() <= n {
            let token = self.read_next();
            self.buffer.push(token);
        }
        self.buffer.get(n).and_then(|opt| opt.as_ref())
    }

    pub fn span(&self) -> Span {
        let span = self.inner.span();
        Span::new(span.start, span.end)
    }
}
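
// A minimal usage sketch (illustrative, assuming Span implements Debug):
//
//     let mut lexer = Lexer::new("data|τ");
//     assert!(matches!(lexer.peek(), Some((Token::Ident(_), _))));
//     while let Some((token, span)) = lexer.next_token() {
//         println!("{:?} @ {:?}", token, span);
//     }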

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_morphemes() {
        // Note: Case matters for Greek letters in Sigil:
        // - lowercase λ = Token::LambdaExpr (closure expression), uppercase Λ = Token::Lambda (morpheme)
        // - lowercase σ = Token::Sigma (sort morpheme), uppercase Σ = Token::Struct (keyword)
        // - lowercase π = identifier, uppercase Π = Token::Pi (product morpheme)
        // This test verifies uppercase morphemes that are NOT keywords
        let mut lexer = Lexer::new("τ φ σ ρ Λ Π ⌛");
        assert!(matches!(lexer.next_token(), Some((Token::Tau, _))));      // τ = Tau (transform)
        assert!(matches!(lexer.next_token(), Some((Token::Phi, _))));      // φ = Phi (filter)
        assert!(matches!(lexer.next_token(), Some((Token::Sigma, _))));    // σ = Sigma (sort/sum)
        assert!(matches!(lexer.next_token(), Some((Token::Rho, _))));      // ρ = Rho (reduce)
        assert!(matches!(lexer.next_token(), Some((Token::Lambda, _))));   // Λ = Lambda (uppercase)
        assert!(matches!(lexer.next_token(), Some((Token::Pi, _))));       // Π = Pi (product)
        assert!(matches!(lexer.next_token(), Some((Token::Async, _))));    // ⌛ = Async
    }

    #[test]
    fn test_evidentiality() {
        let mut lexer = Lexer::new("value! uncertain? reported~ paradox‽");
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "value"));
        assert!(matches!(lexer.next_token(), Some((Token::Bang, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "uncertain"));
        assert!(matches!(lexer.next_token(), Some((Token::Question, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "reported"));
        assert!(matches!(lexer.next_token(), Some((Token::Tilde, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "paradox"));
        assert!(matches!(lexer.next_token(), Some((Token::Interrobang, _))));
    }
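
    // Added test sketch (illustrative, not from the original suite):
    // exercises the SGDOC evidential doc-comment tokens and their helpers.
    #[test]
    fn test_doc_comment_evidentiality() {
        let mut lexer = Lexer::new("//! proven\n//? dubious");
        match lexer.next_token() {
            Some((tok @ Token::DocCommentVerified(_), _)) => {
                assert_eq!(tok.doc_comment_evidentiality(), Some('!'));
                assert_eq!(tok.doc_comment_content(), Some("proven"));
            }
            other => panic!("Expected DocCommentVerified, got {:?}", other),
        }
        assert!(matches!(lexer.next_token(), Some((Token::DocCommentUncertain(s), _)) if s == "dubious"));
    }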
1555
1556    #[test]
1557    fn test_pipe_chain() {
1558        let mut lexer = Lexer::new("data|τ{f}|φ{p}|σ");
1559        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "data"));
1560        assert!(matches!(lexer.next_token(), Some((Token::Pipe, _))));
1561        assert!(matches!(lexer.next_token(), Some((Token::Tau, _))));
1562        assert!(matches!(lexer.next_token(), Some((Token::LBrace, _))));
1563        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "f"));
1564        assert!(matches!(lexer.next_token(), Some((Token::RBrace, _))));
1565    }
1566
1567    #[test]
1568    fn test_numbers() {
1569        let mut lexer = Lexer::new("42 0b1010 0o52 0x2A 0v22 0s42 3.14");
1570        assert!(matches!(lexer.next_token(), Some((Token::IntLit(s), _)) if s == "42"));
1571        assert!(matches!(lexer.next_token(), Some((Token::BinaryLit(s), _)) if s == "0b1010"));
1572        assert!(matches!(lexer.next_token(), Some((Token::OctalLit(s), _)) if s == "0o52"));
1573        assert!(matches!(lexer.next_token(), Some((Token::HexLit(s), _)) if s == "0x2A"));
1574        assert!(matches!(lexer.next_token(), Some((Token::VigesimalLit(s), _)) if s == "0v22"));
1575        assert!(matches!(lexer.next_token(), Some((Token::SexagesimalLit(s), _)) if s == "0s42"));
1576        assert!(matches!(lexer.next_token(), Some((Token::FloatLit(s), _)) if s == "3.14"));
1577    }
1578
1579    #[test]
1580    fn test_incorporation() {
1581        let mut lexer = Lexer::new("file·open·read");
1582        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "file"));
1583        assert!(matches!(lexer.next_token(), Some((Token::MiddleDot, _))));
1584        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "open"));
1585        assert!(matches!(lexer.next_token(), Some((Token::MiddleDot, _))));
1586        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "read"));
1587    }
1588
1589    #[test]
1590    fn test_special_symbols() {
1591        let mut lexer = Lexer::new("∅ ◯ ∞");
1592        assert!(matches!(lexer.next_token(), Some((Token::Empty, _))));
1593        assert!(matches!(lexer.next_token(), Some((Token::Circle, _))));
1594        assert!(matches!(lexer.next_token(), Some((Token::Infinity, _))));
1595    }
1596
1597    #[test]
1598    fn test_quantifiers() {
1599        let mut lexer = Lexer::new("∀x ∃y x∈S y∉T");
1600        assert!(matches!(lexer.next_token(), Some((Token::ForAll, _))));
1601        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "x"));
1602        assert!(matches!(lexer.next_token(), Some((Token::Exists, _))));
1603        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "y"));
1604        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "x"));
1605        assert!(matches!(lexer.next_token(), Some((Token::ElementOf, _))));
    }

    #[test]
    fn test_set_operations() {
        let mut lexer = Lexer::new("A∪B A∩B A∖B A⊂B A⊆B");
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "A"));
        assert!(matches!(lexer.next_token(), Some((Token::Union, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "B"));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "A"));
        assert!(matches!(lexer.next_token(), Some((Token::Intersection, _))));
    }

    #[test]
    fn test_logic_operators() {
        let mut lexer = Lexer::new("p∧q p∨q ¬p p⊻q ⊤ ⊥");
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "p"));
        assert!(matches!(lexer.next_token(), Some((Token::LogicAnd, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "q"));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "p"));
        assert!(matches!(lexer.next_token(), Some((Token::LogicOr, _))));
    }

    #[test]
    fn test_analysis_operators() {
        let mut lexer = Lexer::new("∫f ∂g √x ∛y f∘g");
        assert!(matches!(lexer.next_token(), Some((Token::Integral, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "f"));
        assert!(matches!(lexer.next_token(), Some((Token::Partial, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "g"));
        assert!(matches!(lexer.next_token(), Some((Token::Sqrt, _))));
    }

    #[test]
    fn test_additional_morphemes() {
        let mut lexer = Lexer::new("δ ε ω α ζ");
        assert!(matches!(lexer.next_token(), Some((Token::Delta, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Epsilon, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Omega, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Alpha, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Zeta, _))));
    }

    #[test]
    fn test_ffi_keywords() {
        let mut lexer = Lexer::new("extern unsafe");
        assert!(matches!(lexer.next_token(), Some((Token::Extern, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Unsafe, _))));
    }

    #[test]
    fn test_parallel_morphemes() {
        // Note: ⊛ is Token::Convolve (convolution operator), not Gpu;
        // the keyword `gpu` produces Token::Gpu.
        let mut lexer = Lexer::new("∥ parallel ⊛ gpu");
        assert!(matches!(lexer.next_token(), Some((Token::Parallel, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Parallel, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Convolve, _))));  // ⊛ = Convolve
        assert!(matches!(lexer.next_token(), Some((Token::Gpu, _))));       // gpu = Gpu
    }

    #[test]
    fn test_lifetime_labels() {
        // Test loop labels
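        // The Lifetime payload carries the label without its leading
        // apostrophe, and Rust's `break` surfaces as DeprecatedRustKeyword
        // rather than a keyword token of its own.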
        let mut lexer = Lexer::new("'outer: loop { break 'outer }");
        assert!(matches!(lexer.next_token(), Some((Token::Lifetime(s), _)) if s == "outer"));
        assert!(matches!(lexer.next_token(), Some((Token::Colon, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Loop, _))));
        assert!(matches!(lexer.next_token(), Some((Token::LBrace, _))));
        assert!(matches!(lexer.next_token(), Some((Token::DeprecatedRustKeyword(s), _)) if s == "break"));
        assert!(matches!(lexer.next_token(), Some((Token::Lifetime(s), _)) if s == "outer"));
        assert!(matches!(lexer.next_token(), Some((Token::RBrace, _))));
    }

    // ==================== STRING LITERAL TESTS ====================

    #[test]
    fn test_string_escape_sequences() {
        // Test basic escape sequences
        let mut lexer = Lexer::new(r#""hello\nworld""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "hello\nworld"),
            other => panic!("Expected StringLit, got {:?}", other),
        }

        // Test tab escape
        let mut lexer = Lexer::new(r#""hello\tworld""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "hello\tworld"),
            other => panic!("Expected StringLit, got {:?}", other),
        }

        // Test carriage return
        let mut lexer = Lexer::new(r#""hello\rworld""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "hello\rworld"),
            other => panic!("Expected StringLit, got {:?}", other),
        }

        // Test escaped backslash
        let mut lexer = Lexer::new(r#""hello\\world""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "hello\\world"),
            other => panic!("Expected StringLit, got {:?}", other),
        }

        // Test escaped quote
        let mut lexer = Lexer::new(r#""hello\"world""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "hello\"world"),
            other => panic!("Expected StringLit, got {:?}", other),
        }

        // Test null character
        let mut lexer = Lexer::new(r#""hello\0world""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "hello\0world"),
            other => panic!("Expected StringLit, got {:?}", other),
        }
    }

    #[test]
    fn test_string_hex_escape() {
        // Test \xNN hex escape
        let mut lexer = Lexer::new(r#""hello\x41world""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "helloAworld"),
            other => panic!("Expected StringLit, got {:?}", other),
        }
    }

    #[test]
    fn test_string_unicode_escape() {
        // Test \u{NNNN} Unicode escape
        let mut lexer = Lexer::new(r#""hello\u{1F600}world""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "hello😀world"),
            other => panic!("Expected StringLit, got {:?}", other),
        }

        // Test Greek letter
        let mut lexer = Lexer::new(r#""\u{03C4}""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "τ"),
            other => panic!("Expected StringLit, got {:?}", other),
        }
    }

    #[test]
    fn test_char_escape_sequences() {
        let mut lexer = Lexer::new(r"'\n'");
        match lexer.next_token() {
            Some((Token::CharLit(c), _)) => assert_eq!(c, '\n'),
            other => panic!("Expected CharLit, got {:?}", other),
        }

        let mut lexer = Lexer::new(r"'\t'");
        match lexer.next_token() {
            Some((Token::CharLit(c), _)) => assert_eq!(c, '\t'),
            other => panic!("Expected CharLit, got {:?}", other),
        }

        let mut lexer = Lexer::new(r"'\\'");
        match lexer.next_token() {
            Some((Token::CharLit(c), _)) => assert_eq!(c, '\\'),
            other => panic!("Expected CharLit, got {:?}", other),
        }
    }

    #[test]
    fn test_raw_string() {
        // Raw strings should NOT process escapes
        let mut lexer = Lexer::new(r#"r"hello\nworld""#);
        match lexer.next_token() {
            Some((Token::RawStringLit(s), _)) => assert_eq!(s, r"hello\nworld"),
            other => panic!("Expected RawStringLit, got {:?}", other),
        }
    }

    #[test]
    fn test_raw_string_delimited() {
        // r#"..."# style
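        // The outer r##"..."## is Rust-level quoting for this test source;
        // the Sigil input under test is r#"hello "world""#, whose payload
        // contains an unescaped double quote.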
        let mut lexer = Lexer::new(r##"r#"hello "world""#"##);
        match lexer.next_token() {
            Some((Token::RawStringDelimited(s), _)) => assert_eq!(s, r#"hello "world""#),
            other => panic!("Expected RawStringDelimited, got {:?}", other),
        }
    }

    #[test]
    fn test_byte_string() {
        let mut lexer = Lexer::new(r#"b"hello""#);
        match lexer.next_token() {
            Some((Token::ByteStringLit(bytes), _)) => {
                assert_eq!(bytes, vec![104, 101, 108, 108, 111]); // "hello" in ASCII
            }
            other => panic!("Expected ByteStringLit, got {:?}", other),
        }
    }

    #[test]
    fn test_interpolated_string() {
        let mut lexer = Lexer::new(r#"f"hello {name}""#);
        match lexer.next_token() {
            Some((Token::InterpolatedStringLit(s), _)) => assert_eq!(s, "hello {name}"),
            other => panic!("Expected InterpolatedStringLit, got {:?}", other),
        }
    }
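
    // A sketch of extracting placeholder names from the verbatim payload of
    // an interpolated string. The lexer keeps `hello {name}` as-is, so
    // splitting out `{...}` segments is presumably a later stage's job;
    // `placeholder_names` is hypothetical and ignores nesting and escaped
    // braces.
    fn placeholder_names(payload: &str) -> Vec<&str> {
        let mut names = Vec::new();
        let mut rest = payload;
        while let Some(start) = rest.find('{') {
            let Some(len) = rest[start + 1..].find('}') else { break };
            names.push(&rest[start + 1..start + 1 + len]);
            rest = &rest[start + 1 + len + 1..];
        }
        names
    }

    #[test]
    fn test_placeholder_extraction_sketch() {
        assert_eq!(placeholder_names("hello {name}"), vec!["name"]);
        assert_eq!(placeholder_names("/api/v1/{resource}/{id}"), vec!["resource", "id"]);
    }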

    #[test]
    fn test_sigil_string_sql() {
        let mut lexer = Lexer::new(r#"σ"SELECT * FROM {table}""#);
        match lexer.next_token() {
            Some((Token::SigilStringSql(s), _)) => assert_eq!(s, "SELECT * FROM {table}"),
            other => panic!("Expected SigilStringSql, got {:?}", other),
        }
    }

    #[test]
    fn test_sigil_string_route() {
        let mut lexer = Lexer::new(r#"ρ"/api/v1/{resource}/{id}""#);
        match lexer.next_token() {
            Some((Token::SigilStringRoute(s), _)) => assert_eq!(s, "/api/v1/{resource}/{id}"),
            other => panic!("Expected SigilStringRoute, got {:?}", other),
        }
    }
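
    // Like f-strings, the σ (SQL) and ρ (route) sigil strings above keep
    // their payloads verbatim; substituting the `{...}` placeholders is
    // presumably deferred to a later, DSL-aware stage.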

    #[test]
    fn test_unicode_in_strings() {
        // Test direct Unicode in strings
        let mut lexer = Lexer::new(r#""τφσρ 你好 🦀""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "τφσρ 你好 🦀"),
            other => panic!("Expected StringLit, got {:?}", other),
        }
    }

    #[test]
    fn test_empty_string() {
        let mut lexer = Lexer::new(r#""""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, ""),
            other => panic!("Expected empty StringLit, got {:?}", other),
        }
    }

    #[test]
    fn test_escape_sequence_helper() {
        // Unit test the helper function directly
        assert_eq!(process_escape_sequences(r"hello\nworld"), "hello\nworld");
        assert_eq!(process_escape_sequences(r"hello\tworld"), "hello\tworld");
        assert_eq!(process_escape_sequences(r"hello\\world"), "hello\\world");
        assert_eq!(process_escape_sequences(r#"hello\"world"#), "hello\"world");
        assert_eq!(process_escape_sequences(r"hello\x41world"), "helloAworld");
        assert_eq!(
            process_escape_sequences(r"hello\u{1F600}world"),
            "hello😀world"
        );
    }
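
    // A sketch exercising two corners of the helper documented at the top of
    // this file: line continuation (backslash-newline plus leading whitespace
    // is stripped entirely) and unknown-escape passthrough.
    #[test]
    fn test_escape_helper_corners_sketch() {
        assert_eq!(process_escape_sequences("hello\\\n    world"), "helloworld");
        assert_eq!(process_escape_sequences("a\\\n\tb"), "ab");
        // Unknown escapes are kept as-is, backslash included.
        assert_eq!(process_escape_sequences(r"\q"), "\\q");
    }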
}