// sigil_parser/lexer.rs

1//! Lexer for the Sigil programming language.
2//!
3//! Handles polysynthetic morphemes, evidentiality markers, and multi-base numerals.
4
5use crate::span::Span;
6use logos::Logos;
7
8/// Process escape sequences in a string literal.
9/// Converts \n, \t, \r, \\, \", \', \0, \xNN, \u{NNNN} to their actual characters.
10/// Also handles line continuation: \<newline><whitespace> is stripped entirely.
/// Process escape sequences in a string literal.
///
/// Converts `\n`, `\t`, `\r`, `\\`, `\"`, `\'`, `\0`, `\xNN`, `\u{NNNN}` to
/// their actual characters. A backslash followed by a newline (LF or CRLF) is
/// a line continuation: the newline and any leading spaces/tabs on the next
/// line are stripped entirely. Unknown escapes are kept verbatim; malformed
/// `\x`/`\u{...}` escapes are silently dropped.
fn process_escape_sequences(s: &str) -> String {
    let mut result = String::with_capacity(s.len());
    let mut chars = s.chars().peekable();

    while let Some(c) = chars.next() {
        if c != '\\' {
            result.push(c);
            continue;
        }
        let esc = chars.next();
        // Normalize a CRLF after the backslash into a plain LF so that
        // Windows-style line continuations (`\` + CRLF) are stripped too,
        // instead of leaking `\` + CR into the output.
        let esc = if esc == Some('\r') && chars.peek() == Some(&'\n') {
            chars.next(); // consume the '\n' of the CRLF pair
            Some('\n')
        } else {
            esc
        };
        match esc {
            Some('\n') => {
                // Line continuation: skip newline and any leading whitespace
                while let Some(&c) = chars.peek() {
                    if c == ' ' || c == '\t' {
                        chars.next();
                    } else {
                        break;
                    }
                }
            }
            Some('n') => result.push('\n'),
            Some('t') => result.push('\t'),
            Some('r') => result.push('\r'),
            Some('\\') => result.push('\\'),
            Some('"') => result.push('"'),
            Some('\'') => result.push('\''),
            Some('0') => result.push('\0'),
            Some('x') => {
                // \xNN - up to two hex digits (stop at the first non-hex char).
                let mut hex = String::new();
                for _ in 0..2 {
                    match chars.peek() {
                        Some(&c) if c.is_ascii_hexdigit() => {
                            hex.push(c);
                            chars.next();
                        }
                        _ => break,
                    }
                }
                // The byte value is mapped directly to a code point
                // (Latin-1 semantics for values above 0x7F).
                if let Ok(val) = u8::from_str_radix(&hex, 16) {
                    result.push(val as char);
                }
            }
            Some('u') => {
                // \u{NNNN} - Unicode code point in braces.
                if chars.peek() == Some(&'{') {
                    chars.next(); // consume '{'
                    let mut hex = String::new();
                    while let Some(&c) = chars.peek() {
                        if c == '}' {
                            chars.next();
                            break;
                        }
                        if c.is_ascii_hexdigit() {
                            hex.push(c);
                            chars.next();
                        } else {
                            break;
                        }
                    }
                    if let Ok(val) = u32::from_str_radix(&hex, 16) {
                        // Invalid code points (surrogates, > U+10FFFF) are dropped.
                        if let Some(c) = char::from_u32(val) {
                            result.push(c);
                        }
                    }
                }
            }
            Some(other) => {
                // Unknown escape, keep as-is
                result.push('\\');
                result.push(other);
            }
            // Trailing backslash at the very end of the literal.
            None => result.push('\\'),
        }
    }
    result
}
85
86/// Process escape sequences in byte string literals, returning bytes.
/// Process escape sequences in byte string literals, returning raw bytes.
///
/// Recognizes the same simple escapes as `process_escape_sequences`, with
/// `\xNN` emitted as the raw byte value. A backslash followed by a newline
/// (LF or CRLF) is a line continuation and is stripped together with any
/// leading spaces/tabs on the next line. Non-ASCII characters are ignored
/// (byte strings are ASCII-only, as in Rust).
fn process_byte_escape_sequences(s: &str) -> Vec<u8> {
    let mut result = Vec::with_capacity(s.len());
    let mut chars = s.chars().peekable();

    while let Some(c) = chars.next() {
        if c != '\\' {
            // Non-ASCII in byte strings is ignored (Rust doesn't allow it)
            if c.is_ascii() {
                result.push(c as u8);
            }
            continue;
        }
        let esc = chars.next();
        // Normalize a CRLF after the backslash into a plain LF so that
        // Windows-style line continuations are stripped too (matches
        // `process_escape_sequences`).
        let esc = if esc == Some('\r') && chars.peek() == Some(&'\n') {
            chars.next(); // consume the '\n' of the CRLF pair
            Some('\n')
        } else {
            esc
        };
        match esc {
            Some('\n') => {
                // Line continuation: skip newline and any leading whitespace
                while let Some(&c) = chars.peek() {
                    if c == ' ' || c == '\t' {
                        chars.next();
                    } else {
                        break;
                    }
                }
            }
            Some('n') => result.push(b'\n'),
            Some('t') => result.push(b'\t'),
            Some('r') => result.push(b'\r'),
            Some('\\') => result.push(b'\\'),
            Some('"') => result.push(b'"'),
            Some('\'') => result.push(b'\''),
            Some('0') => result.push(0),
            Some('x') => {
                // \xNN - up to two hex digits, emitted as the raw byte value.
                let mut hex = String::new();
                for _ in 0..2 {
                    match chars.peek() {
                        Some(&c) if c.is_ascii_hexdigit() => {
                            hex.push(c);
                            chars.next();
                        }
                        _ => break,
                    }
                }
                if let Ok(val) = u8::from_str_radix(&hex, 16) {
                    result.push(val);
                }
            }
            Some(other) => {
                // Unknown escape, keep as-is (non-ASCII escape chars dropped).
                result.push(b'\\');
                if other.is_ascii() {
                    result.push(other as u8);
                }
            }
            // Trailing backslash at the very end of the literal.
            None => result.push(b'\\'),
        }
    }
    result
}
141
/// Callback for block comments: /* ... */
/// Consumes characters until */ is found. If the comment is unterminated,
/// the rest of the input is consumed as comment text (no lex error).
/// Returns the comment body without the /* and */ delimiters.
fn block_comment_callback(lex: &mut logos::Lexer<'_, Token>) -> Option<String> {
    let remainder = lex.remainder();

    // Find the closing */
    if let Some(end_pos) = remainder.find("*/") {
        let content = &remainder[..end_pos];
        // Bump past content and closing */ (2 chars)
        lex.bump(end_pos + 2);
        Some(content.to_string())
    } else {
        // No closing */ found - consume rest as comment
        let len = remainder.len();
        lex.bump(len);
        Some(remainder.to_string())
    }
}
162
/// Callback for delimited raw strings (r#"..."#).
/// Reads until the closing `"#` is found and returns the raw content
/// (no escape processing). Returns `None` (a lex error) when unterminated.
fn raw_string_delimited_callback(lex: &mut logos::Lexer<'_, Token>) -> Option<String> {
    let remainder = lex.remainder();

    // Find the closing "#
    if let Some(end_pos) = remainder.find("\"#") {
        let content = &remainder[..end_pos];
        // Bump past content and closing "# (2 chars)
        lex.bump(end_pos + 2);
        Some(content.to_string())
    } else {
        None
    }
}
176
/// Callback for multi-line string literals.
/// Reads from """ until the next """ is found; escape sequences in the body
/// are processed. Returns `None` (a lex error) when unterminated.
fn multiline_string_callback(lex: &mut logos::Lexer<'_, Token>) -> Option<String> {
    let remainder = lex.remainder();

    // Find the closing """
    if let Some(end_pos) = remainder.find("\"\"\"") {
        let content = &remainder[..end_pos];
        // Bump the lexer past the content and closing quotes
        lex.bump(end_pos + 3);
        Some(process_escape_sequences(content))
    } else {
        // No closing """ found - fail the token so logos reports an error
        None
    }
}
193
194/// Process escape sequences in a character literal.
/// Process escape sequences in a character literal.
/// Decodes a single (possibly escaped) character; malformed input yields '?'.
fn process_char_escape(s: &str) -> char {
    let mut it = s.chars();
    let first = match it.next() {
        Some(ch) => ch,
        None => return '?',
    };
    // Unescaped literal: the character itself.
    if first != '\\' {
        return first;
    }
    match it.next() {
        Some('n') => '\n',
        Some('t') => '\t',
        Some('r') => '\r',
        Some('\\') => '\\',
        Some('"') => '"',
        Some('\'') => '\'',
        Some('0') => '\0',
        Some('x') => {
            // \xNN: two hex digits, mapped directly to a code point.
            let digits: String = it.take(2).collect();
            match u8::from_str_radix(&digits, 16) {
                Ok(v) => v as char,
                Err(_) => '?',
            }
        }
        Some('u') => {
            // \u{NNNN}: hex digits inside braces.
            if it.next() != Some('{') {
                return '?';
            }
            let digits: String = it.take_while(|&ch| ch != '}').collect();
            u32::from_str_radix(&digits, 16)
                .ok()
                .and_then(char::from_u32)
                .unwrap_or('?')
        }
        // Unknown escape decodes to the escaped character itself.
        Some(other) => other,
        None => '?',
    }
}
230
231/// Process escape sequences in a byte character literal (b'x').
232fn process_byte_char_escape(s: &str) -> u8 {
233    let mut chars = s.chars();
234    match chars.next() {
235        Some('\\') => match chars.next() {
236            Some('n') => b'\n',
237            Some('t') => b'\t',
238            Some('r') => b'\r',
239            Some('\\') => b'\\',
240            Some('"') => b'"',
241            Some('\'') => b'\'',
242            Some('0') => b'\0',
243            Some('x') => {
244                let hex: String = chars.take(2).collect();
245                u8::from_str_radix(&hex, 16).unwrap_or(b'?')
246            }
247            Some(c) => c as u8,
248            None => b'?',
249        },
250        Some(c) => c as u8,
251        None => b'?',
252    }
253}
254
/// Token types for Sigil.
///
/// Produced by the `logos`-derived lexer. Whitespace (space, tab, CR, LF,
/// form feed) is skipped between tokens. Several keywords have Sigil-native
/// aliases (e.g. `scroll` = `mod`, `tome` = `crate`), and many operators have
/// both a Unicode symbol and an ASCII keyword form.
#[derive(Logos, Debug, Clone, PartialEq)]
#[logos(skip r"[ \t\r\n\f]+")]
pub enum Token {
    // === Comments ===
    #[regex(r"//[^\n]*", |lex| lex.slice().to_string())]
    LineComment(String),

    #[regex(r"//![^\n]*", |lex| lex.slice().to_string())]
    DocComment(String),

    // Tilde comment style: ~~ ... ~~
    #[regex(r"~~[^\n]*", |lex| lex.slice().to_string())]
    TildeComment(String),

    // Block comment: /* ... */ (non-nested)
    #[token("/*", block_comment_callback)]
    BlockComment(String),

    // === Keywords ===
    #[token("fn")]
    Fn,
    #[token("async")]
    Async,
    #[token("let")]
    Let,
    #[token("mut")]
    Mut,
    #[token("const")]
    Const,
    #[token("type")]
    Type,
    #[token("struct")]
    #[token("sigil")]  // Alternative syntax for struct
    Struct,
    #[token("enum")]
    Enum,
    #[token("trait")]
    Trait,
    #[token("impl")]
    Impl,
    #[token("mod")]
    #[token("scroll")]  // Sigil-native: scroll = mod
    Mod,
    #[token("use")]
    #[token("invoke")]  // Sigil-native: invoke = use
    Use,
    #[token("pub")]
    Pub,
    #[token("actor")]
    Actor,
    #[token("saga")]
    Saga,
    #[token("scope")]
    Scope,
    #[token("rune")]
    Rune,
    #[token("macro")]
    Macro,
    #[token("macro_rules")]
    MacroRules,

    // Control flow
    #[token("if")]
    If,
    #[token("else")]
    Else,
    #[token("match")]
    Match,
    #[token("loop")]
    Loop,
    #[token("while")]
    While,
    #[token("for")]
    For,
    #[token("in")]
    In,
    #[token("break")]
    Break,
    #[token("continue")]
    Continue,
    #[token("return")]
    Return,
    #[token("yield")]
    Yield,
    #[token("await")]
    Await,

    // Other keywords
    #[token("self")]
    SelfLower,
    #[token("Self")]
    SelfUpper,
    #[token("super")]
    Super,
    #[token("crate")]
    #[token("tome")]  // Sigil-native: tome = crate
    Crate,
    #[token("where")]
    Where,
    #[token("as")]
    As,
    #[token("dyn")]
    Dyn,
    #[token("move")]
    Move,
    #[token("ref")]
    Ref,
    #[token("static")]
    Static,
    #[token("unsafe")]
    Unsafe,
    #[token("extern")]
    Extern,
    #[token("asm")]
    Asm,
    #[token("volatile")]
    Volatile,
    #[token("naked")]
    Naked,
    #[token("packed")]
    Packed,
    #[token("simd")]
    Simd,
    #[token("atomic")]
    Atomic,
    #[token("derive")]
    Derive,
    #[token("on")]
    On,

    // Plurality keywords (DAEMONIORUM extensions)
    #[token("alter")]
    Alter,
    #[token("switch")]
    Switch,
    #[token("headspace")]
    Headspace,
    #[token("cocon")]
    CoCon,
    #[token("reality")]
    Reality,
    #[token("split")]
    Split,
    #[token("trigger")]
    Trigger,
    #[token("layer")]
    Layer,
    #[token("location")]
    Location,
    #[token("states")]
    States,
    #[token("anima")]
    Anima,
    #[token("to")]
    To,
    #[token("from")]
    From,

    // Alter-source markers (compound tokens)
    #[token("@!")]
    AlterSourceFronting,
    #[token("@~")]
    AlterSourceCoCon,
    #[token("@?")]
    AlterSourceDormant,
    #[token("@‽")]
    AlterSourceBlended,

    // Boolean literals
    #[token("true")]
    True,
    #[token("false")]
    False,

    // Null literal
    #[token("null")]
    Null,

    // === Morphemes (Greek letters) ===
    #[token("τ")]
    #[token("Τ")]
    Tau, // Transform/map

    #[token("φ")]
    #[token("Φ")]
    Phi, // Filter

    #[token("σ")]
    #[token("Σ")]
    Sigma, // Sort (lowercase) / Sum (uppercase)

    #[token("ρ")]
    #[token("Ρ")]
    Rho, // Reduce

    #[token("λ")]
    #[token("Λ")]
    Lambda, // Lambda

    #[token("Π")]
    Pi, // Product

    #[token("⌛")]
    Hourglass, // Await symbol

    // Additional morphemes
    #[token("δ")]
    #[token("Δ")]
    Delta, // Difference/change

    #[token("ε")]
    Epsilon, // Empty/null

    #[token("ω")]
    #[token("Ω")]
    Omega, // End/terminal

    #[token("α")]
    Alpha, // First element

    #[token("ζ")]
    Zeta, // Zip/combine

    // === Additional Access Morphemes ===
    #[token("μ")]
    #[token("Μ")]
    Mu, // Middle/median element

    #[token("χ")]
    #[token("Χ")]
    Chi, // Random/choice (from chaos)

    #[token("ν")]
    #[token("Ν")]
    Nu, // Nth element (ordinal)

    #[token("ξ")]
    #[token("Ξ")]
    Xi, // Next in sequence

    #[token("ψ")]
    #[token("Ψ")]
    Psi, // Psychological/mental state

    #[token("θ")]
    #[token("Θ")]
    Theta, // Threshold/angle

    #[token("κ")]
    #[token("Κ")]
    Kappa, // Callback/continuation

    // === Parallel/Concurrency Morphemes ===
    #[token("∥")]
    #[token("parallel")]
    Parallel, // Parallel execution (U+2225)

    #[token("⊛")]
    #[token("gpu")]
    Gpu, // GPU compute shader (U+229B - circled asterisk)

    // === Quantifiers (for AI-native set operations) ===
    #[token("∀")]
    ForAll, // Universal quantification

    #[token("∃")]
    Exists, // Existential quantification

    #[token("∈")]
    ElementOf, // Membership test

    #[token("∉")]
    NotElementOf, // Non-membership

    // === Set Operations ===
    #[token("∪")]
    Union, // Set union

    #[token("∩")]
    Intersection, // Set intersection

    #[token("∖")]
    SetMinus, // Set difference

    #[token("⊂")]
    Subset, // Proper subset

    #[token("⊆")]
    SubsetEq, // Subset or equal

    #[token("⊃")]
    Superset, // Proper superset

    #[token("⊇")]
    SupersetEq, // Superset or equal

    // === Logic Operators ===
    #[token("∧")]
    LogicAnd, // Logical conjunction

    #[token("∨")]
    LogicOr, // Logical disjunction

    #[token("¬")]
    LogicNot, // Logical negation

    #[token("⊻")]
    LogicXor, // Exclusive or

    #[token("⊤")]
    Top, // True/any type

    #[token("⊥")]
    Bottom, // False/never type

    // === Bitwise Operators (Unicode) ===
    #[token("⋏")]
    BitwiseAndSymbol, // Bitwise AND (U+22CF)

    #[token("⋎")]
    BitwiseOrSymbol, // Bitwise OR (U+22CE)

    #[token("⊙")]
    CircledDot, // Hadamard product / element-wise multiply (U+2299)

    // Note: ⊗ (tensor product) is already defined as Token::Tensor below

    // === Type Theory ===
    #[token("∷")]
    TypeAnnotation, // Type annotation (alternative to :)

    // === Analysis/Calculus ===
    #[token("∫")]
    Integral, // Cumulative sum

    #[token("∂")]
    Partial, // Discrete derivative

    #[token("√")]
    Sqrt, // Square root

    #[token("∛")]
    Cbrt, // Cube root

    #[token("∇")]
    Nabla, // Gradient (U+2207)

    // === APL-Inspired Symbols ===
    #[token("⍋")]
    GradeUp, // Sort ascending (U+234B)

    #[token("⍒")]
    GradeDown, // Sort descending (U+2352)

    #[token("⌽")]
    Rotate, // Reverse/rotate (U+233D)

    #[token("↻")]
    CycleArrow, // Cycle/repeat (U+21BB)

    #[token("⌺")]
    QuadDiamond, // Windows/stencil (U+233A)

    #[token("⊞")]
    SquaredPlus, // Chunks (U+229E)

    #[token("⍳")]
    Iota, // Enumerate/index (U+2373)

    // === Category Theory ===
    #[token("∘")]
    Compose, // Function composition

    #[token("⊗")]
    Tensor, // Tensor product

    #[token("⊕")]
    DirectSum, // Direct sum / XOR

    // === Data Operations ===
    #[token("⋈")]
    Bowtie, // Join/zip combining (U+22C8)

    #[token("⋳")]
    ElementSmallVerticalBar, // Flatten (U+22F3)

    #[token("⊔")]
    SquareCup, // Lattice join / supremum (U+2294)

    #[token("⊓")]
    SquareCap, // Lattice meet / infimum (U+2293)

    // === Evidentiality Markers ===
    // Note: These are handled contextually since ! and ? have other uses
    #[token("‽")]
    Interrobang, // Paradox/trust boundary (U+203D)

    #[token("◊")]
    Lozenge, // Predicted/speculative (U+25CA) - Token◊

    // === Legion Morphemes (Holographic Agent Collective) ===
    // From Infernum 2.0 - distributed memory and multi-agent coordination

    #[token("∿")]
    #[token("legion_field")]
    LegionField, // Collective memory substrate (U+223F sine wave) - memory∿

    #[token("⫰")]
    #[token("interfere")]
    Interfere, // Interference query (U+2AF0) - query ⫰ field∿

    #[token("⟁")]
    #[token("distribute")]
    Distribute, // Holographic distribution (U+27C1) - task ⟁ 8

    #[token("⟀")]
    #[token("gather")]
    Gather, // Interference gathering (U+27C0) - fragments ⟀

    #[token("↠")]
    #[token("broadcast")]
    Broadcast, // One-to-many broadcast (U+21A0) - signal ↠ legion

    #[token("⇢")]
    #[token("consensus")]
    Consensus, // Many-to-one consensus (U+21E2) - contributions ⇢

    // Compound Legion operators
    #[token("⊕=")]
    DirectSumEq, // Superposition assign - field∿ ⊕= pattern

    #[token("∂=")]
    PartialEq_, // Decay assign - field∿ ∂= 0.95 (renamed to avoid std conflict)

    #[token("⫰=")]
    InterfereEq, // Interference assign

    // === Affective Markers (Sentiment & Emotion) ===
    // Sentiment polarity
    #[token("⊖")]
    AffectNegative, // Negative sentiment (U+2296 Circled Minus)

    #[token("⊜")]
    AffectNeutral, // Neutral sentiment (U+229C Circled Equals)

    // Note: ⊕ (U+2295) is already DirectSum - we'll use it dual-purpose for positive sentiment

    // Sarcasm/Irony
    #[token("⸮")]
    IronyMark, // Irony/sarcasm marker (U+2E2E - historical percontation point!)

    // Intensity modifiers
    #[token("↑")]
    IntensityUp, // Intensifier (U+2191)

    #[token("↓")]
    IntensityDown, // Dampener (U+2193)

    #[token("⇈")]
    IntensityMax, // Maximum intensity (U+21C8)

    // Formality register
    #[token("♔")]
    FormalRegister, // Formal (U+2654 White King)

    #[token("♟")]
    InformalRegister, // Informal (U+265F Black Pawn)

    // Emotion markers (Plutchik's wheel)
    #[token("☺")]
    EmotionJoy, // Joy (U+263A)

    #[token("☹")]
    EmotionSadness, // Sadness (U+2639)

    #[token("⚡")]
    EmotionAnger, // Anger (U+26A1)

    #[token("❄")]
    EmotionFear, // Fear (U+2744)

    #[token("✦")]
    EmotionSurprise, // Surprise (U+2726)

    #[token("♡")]
    EmotionLove, // Love/Trust (U+2661)

    // Confidence markers
    #[token("◉")]
    ConfidenceHigh, // High confidence (U+25C9)

    #[token("◎")]
    ConfidenceMedium, // Medium confidence (U+25CE)

    #[token("○")]
    ConfidenceLow, // Low confidence (U+25CB)

    // === Aspect Morphemes (verb aspects) ===
    #[token("·ing")]
    AspectProgressive, // Ongoing/streaming aspect

    #[token("·ed")]
    AspectPerfective, // Completed aspect

    #[token("·able")]
    AspectPotential, // Capability aspect

    #[token("·ive")]
    AspectResultative, // Result-producing aspect

    // === Operators ===
    #[token("|")]
    Pipe,
    #[token("·")]
    MiddleDot, // Incorporation
    #[token("->")]
    Arrow,
    #[token("=>")]
    FatArrow,
    #[token("<-")]
    LeftArrow,
    #[token("==")]
    EqEq,
    #[token("!=")]
    NotEq,
    #[token("<=")]
    LtEq,
    #[token(">=")]
    GtEq,
    #[token("<")]
    Lt,
    #[token(">")]
    Gt,
    #[token("+")]
    Plus,
    #[token("-")]
    Minus,
    #[token("*")]
    Star,
    #[token("/")]
    Slash,
    #[token("%")]
    Percent,
    #[token("**")]
    StarStar, // Exponentiation
    #[token("&&")]
    AndAnd,
    #[token("||")]
    OrOr,
    #[token("!")]
    Bang, // Evidentiality: known / logical not
    #[token("?")]
    Question, // Evidentiality: uncertain / try
    #[token("~")]
    Tilde, // Evidentiality: reported
    #[token("&")]
    Amp,
    #[token("^")]
    Caret,
    #[token("<<=")]
    ShlEq,
    #[token(">>=")]
    ShrEq,
    #[token("<<")]
    Shl,
    #[token(">>")]
    Shr,
    #[token("=")]
    Eq,
    #[token("+=")]
    PlusEq,
    #[token("-=")]
    MinusEq,
    #[token("*=")]
    StarEq,
    #[token("/=")]
    SlashEq,
    #[token("%=")]
    PercentEq,
    #[token("|=")]
    PipeEq,
    #[token("&=")]
    AmpEq,
    #[token("^=")]
    CaretEq,
    #[token("..")]
    DotDot,
    #[token("..=")]
    DotDotEq,
    #[token("++")]
    PlusPlus, // Concatenation
    #[token("::")]
    ColonColon,
    #[token(":")]
    Colon,
    #[token(";")]
    Semi,
    #[token(",")]
    Comma,
    #[token(".")]
    Dot,
    #[token("@")]
    At,
    #[token("#!")]
    HashBang, // Inner attribute prefix #![...]
    #[token("#")]
    Hash,
    #[token("_", priority = 3)]
    Underscore,

    // === Delimiters ===
    #[token("(")]
    LParen,
    #[token(")")]
    RParen,
    #[token("{")]
    LBrace,
    #[token("}")]
    RBrace,
    #[token("[")]
    LBracket,
    #[token("]")]
    RBracket,

    // === Special symbols ===
    #[token("∅")]
    Empty, // Void/emptiness (śūnya)
    #[token("◯")]
    Circle, // Geometric zero
    #[token("∞")]
    Infinity, // Ananta

    // === Protocol Operations (Sigil-native networking) ===
    #[token("⇒")]
    ProtoSend, // Send data (U+21D2 - rightwards double arrow)

    #[token("⇐")]
    ProtoRecv, // Receive data (U+21D0 - leftwards double arrow)

    #[token("≋")]
    ProtoStream, // Stream data (U+224B - triple tilde)

    #[token("⊸")]
    ProtoConnect, // Connect/lollipop (U+22B8 - multimap)

    #[token("⏱")]
    ProtoTimeout, // Timeout (U+23F1 - stopwatch)

    // Note: ⊗ (Tensor) is used for close in protocol contexts

    // Protocol keywords for ASCII fallback
    #[token("send")]
    Send,
    #[token("recv")]
    Recv,
    #[token("stream")]
    Stream,
    #[token("connect")]
    Connect,
    #[token("close")]
    Close,
    #[token("timeout")]
    Timeout,
    #[token("retry")]
    Retry,
    #[token("header")]
    Header,
    #[token("body")]
    Body,

    // Protocol type identifiers (for incorporation: http·, ws·, grpc·, kafka·)
    #[token("http")]
    Http,
    #[token("https")]
    Https,
    #[token("ws")]
    Ws,
    #[token("wss")]
    Wss,
    #[token("grpc")]
    Grpc,
    #[token("kafka")]
    Kafka,
    #[token("amqp")]
    Amqp,
    #[token("graphql")]
    GraphQL,

    // === Numbers ===
    // Binary: 0b... with optional type suffix
    #[regex(r"0b[01_]+(i8|i16|i32|i64|i128|isize|u8|u16|u32|u64|u128|usize)?", |lex| lex.slice().to_string())]
    BinaryLit(String),

    // Octal: 0o... with optional type suffix
    #[regex(r"0o[0-7_]+(i8|i16|i32|i64|i128|isize|u8|u16|u32|u64|u128|usize)?", |lex| lex.slice().to_string())]
    OctalLit(String),

    // Hex: 0x... with optional type suffix
    #[regex(r"0x[0-9a-fA-F_]+(i8|i16|i32|i64|i128|isize|u8|u16|u32|u64|u128|usize)?", |lex| lex.slice().to_string())]
    HexLit(String),

    // Vigesimal: 0v... (base 20)
    #[regex(r"0v[0-9a-jA-J_]+", |lex| lex.slice().to_string())]
    VigesimalLit(String),

    // Sexagesimal: 0s... (base 60)
    #[regex(r"0s[0-9a-zA-Z_]+", |lex| lex.slice().to_string())]
    SexagesimalLit(String),

    // Duodecimal: 0z... (base 12)
    #[regex(r"0z[0-9a-bA-B_]+", |lex| lex.slice().to_string())]
    DuodecimalLit(String),

    // Float: 123.456 or 1.23e10 or 1e-15 (with or without decimal point if exponent present)
    // Optional type suffix: f16, f32, f64, f128
    #[regex(r"([0-9][0-9_]*\.[0-9][0-9_]*([eE][+-]?[0-9_]+)?|[0-9][0-9_]*[eE][+-]?[0-9_]+)(f16|f32|f64|f128)?", |lex| lex.slice().to_string())]
    FloatLit(String),

    // Integer: 123 with optional type suffix (i8, i16, i32, i64, i128, isize, u8, u16, u32, u64, u128, usize)
    #[regex(r"[0-9][0-9_]*(i8|i16|i32|i64|i128|isize|u8|u16|u32|u64|u128|usize)?", |lex| lex.slice().to_string())]
    IntLit(String),

    // === Strings ===
    // Regular string with escape sequence processing
    // Note: \\(.|\n) handles both regular escapes and line continuation (\ at end of line)
    #[regex(r#""([^"\\]|\\(.|\n))*""#, |lex| {
        let s = lex.slice();
        let inner = &s[1..s.len()-1];
        process_escape_sequences(inner)
    })]
    StringLit(String),

    // Multi-line string (triple-quoted) - handled via callback
    #[token(r#"""""#, multiline_string_callback)]
    MultiLineStringLit(String),

    // Byte string literal
    #[regex(r#"b"([^"\\]|\\.)*""#, |lex| {
        let s = lex.slice();
        let inner = &s[2..s.len()-1];
        process_byte_escape_sequences(inner)
    })]
    ByteStringLit(Vec<u8>),

    // Interpolated string (will be parsed further for expressions)
    #[regex(r#"f"([^"\\]|\\.)*""#, |lex| {
        let s = lex.slice();
        let inner = &s[2..s.len()-1];
        process_escape_sequences(inner)
    })]
    InterpolatedStringLit(String),

    // Sigil string - SQL template (σ prefix)
    #[regex(r#"σ"([^"\\]|\\.)*""#, |lex| {
        let s = lex.slice();
        // Get byte index after the σ character (which is 2 bytes in UTF-8)
        let start = "σ".len() + 1; // σ + opening quote
        let inner = &s[start..s.len()-1];
        process_escape_sequences(inner)
    })]
    SigilStringSql(String),

    // Sigil string - Route template (ρ prefix)
    #[regex(r#"ρ"([^"\\]|\\.)*""#, |lex| {
        let s = lex.slice();
        // Get byte index after the ρ character (which is 2 bytes in UTF-8)
        let start = "ρ".len() + 1; // ρ + opening quote
        let inner = &s[start..s.len()-1];
        process_escape_sequences(inner)
    })]
    SigilStringRoute(String),

    // Char literal with escape sequence processing
    // Matches: single char, hex escape \xNN, unicode escape \u{N...}, or simple escape \c
    #[regex(r"'([^'\\]|\\x[0-9a-fA-F]{2}|\\u\{[0-9a-fA-F]{1,6}\}|\\.)'", |lex| {
        let s = lex.slice();
        let inner = &s[1..s.len()-1];
        process_char_escape(inner)
    })]
    CharLit(char),

    // Byte char literal (b'x' or b'\n')
    #[regex(r"b'([^'\\]|\\x[0-9a-fA-F]{2}|\\.)'", |lex| {
        let s = lex.slice();
        // Extract the character between b' and '
        let inner = &s[2..s.len()-1];
        process_byte_char_escape(inner)
    })]
    ByteCharLit(u8),

    // Raw string (no escape processing, but allows \" for literal quotes in patterns)
    #[regex(r#"r"([^"\\]|\\.)*""#, |lex| {
        let s = lex.slice();
        s[2..s.len()-1].to_string()
    })]
    RawStringLit(String),

    // Raw string with delimiter (r#"..."# style) - handles internal quotes
    #[token(r##"r#""##, raw_string_delimited_callback)]
    RawStringDelimited(String),

    // === Lifetime/Label (for loop labels like 'outer: loop { break 'outer }) ===
    #[regex(r"'[a-zA-Z_][a-zA-Z0-9_]*", |lex| lex.slice()[1..].to_string())]
    Lifetime(String),

    // === Identifiers ===
    // Includes Greek letters for polysynthetic identifiers like compute_ψ_state
    // Greek letters (both cases): αΑ, βΒ, γΓ, δΔ, εΕ, ζΖ, ηΗ, θΘ, ιΙ, κΚ, λΛ, μΜ, νΝ, ξΞ, οΟ, πΠ, ρΡ, σΣ, τΤ, υΥ, φΦ, χΧ, ψΨ, ωΩ
    #[regex(r"[a-zA-Z_αΑβΒγΓδΔεΕζΖηΗθΘιΙκΚλΛμΜνΝξΞοΟπΠρΡσΣτΤυΥφΦχΧψΨωΩ][a-zA-Z0-9_αΑβΒγΓδΔεΕζΖηΗθΘιΙκΚλΛμΜνΝξΞοΟπΠρΡσΣτΤυΥφΦχΧψΨωΩ]*", |lex| lex.slice().to_string())]
    Ident(String),

    // === Rune annotation ===
    #[regex(r"//@\s*rune:\s*[a-zA-Z_][a-zA-Z0-9_]*", |lex| lex.slice().to_string())]
    RuneAnnotation(String),
}
1071
1072impl Token {
1073    pub fn is_keyword(&self) -> bool {
1074        matches!(
1075            self,
1076            Token::Fn
1077                | Token::Async
1078                | Token::Let
1079                | Token::Mut
1080                | Token::Const
1081                | Token::Type
1082                | Token::Struct
1083                | Token::Enum
1084                | Token::Trait
1085                | Token::Impl
1086                | Token::Mod
1087                | Token::Use
1088                | Token::Pub
1089                | Token::Actor
1090                | Token::Saga
1091                | Token::Scope
1092                | Token::Rune
1093                | Token::If
1094                | Token::Else
1095                | Token::Match
1096                | Token::Loop
1097                | Token::While
1098                | Token::For
1099                | Token::In
1100                | Token::Break
1101                | Token::Continue
1102                | Token::Return
1103                | Token::Yield
1104                | Token::Await
1105        ) || self.is_plurality_keyword()
1106    }
1107
1108    pub fn is_plurality_keyword(&self) -> bool {
1109        matches!(
1110            self,
1111            Token::Alter
1112                | Token::Switch
1113                | Token::Headspace
1114                | Token::CoCon
1115                | Token::Reality
1116                | Token::Split
1117                | Token::Trigger
1118                | Token::Layer
1119                | Token::Location
1120                | Token::States
1121                | Token::Anima
1122                | Token::To
1123                | Token::From
1124        )
1125    }
1126
1127    pub fn is_alter_source(&self) -> bool {
1128        matches!(
1129            self,
1130            Token::AlterSourceFronting
1131                | Token::AlterSourceCoCon
1132                | Token::AlterSourceDormant
1133                | Token::AlterSourceBlended
1134        )
1135    }
1136
1137    pub fn is_morpheme(&self) -> bool {
1138        matches!(
1139            self,
1140            Token::Tau | Token::Phi | Token::Sigma | Token::Rho |
1141            Token::Lambda | Token::Pi | Token::Hourglass |
1142            Token::Delta | Token::Epsilon | Token::Omega | Token::Alpha | Token::Zeta |
1143            Token::Mu | Token::Chi | Token::Nu | Token::Xi |  // Access morphemes
1144            Token::Parallel | Token::Gpu |  // Concurrency morphemes
1145            Token::Integral | Token::Partial | Token::Sqrt | Token::Cbrt |
1146            Token::Compose
1147        )
1148    }
1149
1150    pub fn is_aspect(&self) -> bool {
1151        matches!(
1152            self,
1153            Token::AspectProgressive
1154                | Token::AspectPerfective
1155                | Token::AspectPotential
1156                | Token::AspectResultative
1157        )
1158    }
1159
1160    pub fn is_data_op(&self) -> bool {
1161        matches!(
1162            self,
1163            Token::Bowtie | Token::ElementSmallVerticalBar | Token::SquareCup | Token::SquareCap
1164        )
1165    }
1166
1167    pub fn is_bitwise_symbol(&self) -> bool {
1168        matches!(self, Token::BitwiseAndSymbol | Token::BitwiseOrSymbol)
1169    }
1170
1171    pub fn is_quantifier(&self) -> bool {
1172        matches!(
1173            self,
1174            Token::ForAll | Token::Exists | Token::ElementOf | Token::NotElementOf
1175        )
1176    }
1177
1178    pub fn is_set_op(&self) -> bool {
1179        matches!(
1180            self,
1181            Token::Union
1182                | Token::Intersection
1183                | Token::SetMinus
1184                | Token::Subset
1185                | Token::SubsetEq
1186                | Token::Superset
1187                | Token::SupersetEq
1188        )
1189    }
1190
1191    pub fn is_logic_op(&self) -> bool {
1192        matches!(
1193            self,
1194            Token::LogicAnd
1195                | Token::LogicOr
1196                | Token::LogicNot
1197                | Token::LogicXor
1198                | Token::Top
1199                | Token::Bottom
1200        )
1201    }
1202
1203    pub fn is_evidentiality(&self) -> bool {
1204        matches!(
1205            self,
1206            Token::Bang | Token::Question | Token::Tilde | Token::Interrobang | Token::Lozenge
1207        )
1208    }
1209
1210    pub fn is_legion_morpheme(&self) -> bool {
1211        matches!(
1212            self,
1213            Token::LegionField      // ∿ - collective memory
1214                | Token::DirectSum  // ⊕ - superposition
1215                | Token::Interfere  // ⫰ - interference
1216                | Token::ConfidenceHigh  // ◉ - resonance (dual-purpose)
1217                | Token::Distribute // ⟁ - holographic distribution
1218                | Token::Gather     // ⟀ - interference gathering
1219                | Token::Broadcast  // ↠ - one-to-many
1220                | Token::Consensus  // ⇢ - many-to-one
1221                | Token::Partial    // ∂ - decay
1222        )
1223    }
1224
1225    pub fn is_legion_assign(&self) -> bool {
1226        matches!(
1227            self,
1228            Token::DirectSumEq | Token::PartialEq_ | Token::InterfereEq
1229        )
1230    }
1231
1232    pub fn is_affective(&self) -> bool {
1233        matches!(
1234            self,
1235            // Sentiment
1236            Token::DirectSum |  // ⊕ positive (dual-purpose with DirectSum)
1237            Token::AffectNegative |  // ⊖ negative
1238            Token::AffectNeutral |  // ⊜ neutral
1239            // Sarcasm
1240            Token::IronyMark |  // ⸮ irony/sarcasm
1241            // Intensity
1242            Token::IntensityUp |  // ↑
1243            Token::IntensityDown |  // ↓
1244            Token::IntensityMax |  // ⇈
1245            // Formality
1246            Token::FormalRegister |  // ♔
1247            Token::InformalRegister |  // ♟
1248            // Emotions
1249            Token::EmotionJoy |  // ☺
1250            Token::EmotionSadness |  // ☹
1251            Token::EmotionAnger |  // ⚡
1252            Token::EmotionFear |  // ❄
1253            Token::EmotionSurprise |  // ✦
1254            Token::EmotionLove |  // ♡
1255            // Confidence
1256            Token::ConfidenceHigh |  // ◉
1257            Token::ConfidenceMedium |  // ◎
1258            Token::ConfidenceLow // ○
1259        )
1260    }
1261
1262    pub fn is_sentiment(&self) -> bool {
1263        matches!(
1264            self,
1265            Token::DirectSum | Token::AffectNegative | Token::AffectNeutral
1266        )
1267    }
1268
1269    pub fn is_emotion(&self) -> bool {
1270        matches!(
1271            self,
1272            Token::EmotionJoy
1273                | Token::EmotionSadness
1274                | Token::EmotionAnger
1275                | Token::EmotionFear
1276                | Token::EmotionSurprise
1277                | Token::EmotionLove
1278        )
1279    }
1280
1281    pub fn is_intensity(&self) -> bool {
1282        matches!(
1283            self,
1284            Token::IntensityUp | Token::IntensityDown | Token::IntensityMax
1285        )
1286    }
1287}
1288
/// Lexer wrapping Logos for Sigil.
///
/// Adds arbitrary-depth lookahead (`peek`/`peek_n`) on top of the
/// one-token-at-a-time logos lexer by buffering tokens as they are read.
pub struct Lexer<'a> {
    /// The underlying logos-generated lexer over the source text.
    inner: logos::Lexer<'a, Token>,
    /// Buffer for lookahead tokens (supports multi-token peek).
    /// Each element is `Some((token, span))`; a buffered `None` marks EOF
    /// so repeated peeks past the end don't re-poll the inner lexer.
    buffer: Vec<Option<(Token, Span)>>,
}
1295
1296impl<'a> Lexer<'a> {
1297    pub fn new(source: &'a str) -> Self {
1298        Self {
1299            inner: Token::lexer(source),
1300            buffer: Vec::new(),
1301        }
1302    }
1303
1304    /// Read the next token from the underlying logos lexer
1305    fn read_next(&mut self) -> Option<(Token, Span)> {
1306        match self.inner.next() {
1307            Some(Ok(token)) => {
1308                let span = self.inner.span();
1309                Some((token, Span::new(span.start, span.end)))
1310            }
1311            Some(Err(_)) => {
1312                // Skip invalid tokens and try next
1313                self.read_next()
1314            }
1315            None => None,
1316        }
1317    }
1318
1319    pub fn next_token(&mut self) -> Option<(Token, Span)> {
1320        if !self.buffer.is_empty() {
1321            // Return from buffer (front = next token)
1322            // Each buffer element is Option<(Token, Span)> where None = EOF
1323            return self.buffer.remove(0);
1324        }
1325        self.read_next()
1326    }
1327
1328    pub fn peek(&mut self) -> Option<&(Token, Span)> {
1329        self.peek_n(0)
1330    }
1331
1332    /// Peek n tokens ahead (0 = next token, 1 = token after that, etc.)
1333    pub fn peek_n(&mut self, n: usize) -> Option<&(Token, Span)> {
1334        // Fill buffer up to position n
1335        while self.buffer.len() <= n {
1336            let token = self.read_next();
1337            self.buffer.push(token);
1338        }
1339        self.buffer.get(n).and_then(|opt| opt.as_ref())
1340    }
1341
1342    pub fn span(&self) -> Span {
1343        let span = self.inner.span();
1344        Span::new(span.start, span.end)
1345    }
1346}
1347
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_morphemes() {
        // Pipeline morpheme glyphs; note uppercase Σ lexes as Sigma too.
        let mut lexer = Lexer::new("τ φ σ ρ λ Σ Π ⌛");
        assert!(matches!(lexer.next_token(), Some((Token::Tau, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Phi, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Sigma, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Rho, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Lambda, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Sigma, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Pi, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Hourglass, _))));
    }

    #[test]
    fn test_evidentiality() {
        // Evidentiality markers lex as separate tokens after the identifier.
        let mut lexer = Lexer::new("value! uncertain? reported~ paradox‽");
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "value"));
        assert!(matches!(lexer.next_token(), Some((Token::Bang, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "uncertain"));
        assert!(matches!(lexer.next_token(), Some((Token::Question, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "reported"));
        assert!(matches!(lexer.next_token(), Some((Token::Tilde, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "paradox"));
        assert!(matches!(lexer.next_token(), Some((Token::Interrobang, _))));
    }

    #[test]
    fn test_pipe_chain() {
        // A data|τ{…}|φ{…}|σ pipeline lexes into pipe/morpheme/brace tokens.
        let mut lexer = Lexer::new("data|τ{f}|φ{p}|σ");
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "data"));
        assert!(matches!(lexer.next_token(), Some((Token::Pipe, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Tau, _))));
        assert!(matches!(lexer.next_token(), Some((Token::LBrace, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "f"));
        assert!(matches!(lexer.next_token(), Some((Token::RBrace, _))));
    }

    #[test]
    fn test_numbers() {
        // Each literal keeps its prefix in the token's string payload;
        // 0v = vigesimal (base 20), 0s = sexagesimal (base 60).
        let mut lexer = Lexer::new("42 0b1010 0o52 0x2A 0v22 0s42 3.14");
        assert!(matches!(lexer.next_token(), Some((Token::IntLit(s), _)) if s == "42"));
        assert!(matches!(lexer.next_token(), Some((Token::BinaryLit(s), _)) if s == "0b1010"));
        assert!(matches!(lexer.next_token(), Some((Token::OctalLit(s), _)) if s == "0o52"));
        assert!(matches!(lexer.next_token(), Some((Token::HexLit(s), _)) if s == "0x2A"));
        assert!(matches!(lexer.next_token(), Some((Token::VigesimalLit(s), _)) if s == "0v22"));
        assert!(matches!(lexer.next_token(), Some((Token::SexagesimalLit(s), _)) if s == "0s42"));
        assert!(matches!(lexer.next_token(), Some((Token::FloatLit(s), _)) if s == "3.14"));
    }

    #[test]
    fn test_incorporation() {
        // Noun incorporation: middle dot (·) separates identifier parts.
        let mut lexer = Lexer::new("file·open·read");
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "file"));
        assert!(matches!(lexer.next_token(), Some((Token::MiddleDot, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "open"));
        assert!(matches!(lexer.next_token(), Some((Token::MiddleDot, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "read"));
    }

    #[test]
    fn test_special_symbols() {
        // ∅ empty, ◯ circle, ∞ infinity.
        let mut lexer = Lexer::new("∅ ◯ ∞");
        assert!(matches!(lexer.next_token(), Some((Token::Empty, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Circle, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Infinity, _))));
    }

    #[test]
    fn test_quantifiers() {
        // Quantifiers bind tightly: no whitespace needed before the identifier.
        let mut lexer = Lexer::new("∀x ∃y x∈S y∉T");
        assert!(matches!(lexer.next_token(), Some((Token::ForAll, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "x"));
        assert!(matches!(lexer.next_token(), Some((Token::Exists, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "y"));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "x"));
        assert!(matches!(lexer.next_token(), Some((Token::ElementOf, _))));
    }

    #[test]
    fn test_set_operations() {
        let mut lexer = Lexer::new("A∪B A∩B A∖B A⊂B A⊆B");
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "A"));
        assert!(matches!(lexer.next_token(), Some((Token::Union, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "B"));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "A"));
        assert!(matches!(lexer.next_token(), Some((Token::Intersection, _))));
    }

    #[test]
    fn test_logic_operators() {
        let mut lexer = Lexer::new("p∧q p∨q ¬p p⊻q ⊤ ⊥");
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "p"));
        assert!(matches!(lexer.next_token(), Some((Token::LogicAnd, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "q"));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "p"));
        assert!(matches!(lexer.next_token(), Some((Token::LogicOr, _))));
    }

    #[test]
    fn test_analysis_operators() {
        // ∫ integral, ∂ partial, √ sqrt, ∛ cbrt, ∘ compose.
        let mut lexer = Lexer::new("∫f ∂g √x ∛y f∘g");
        assert!(matches!(lexer.next_token(), Some((Token::Integral, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "f"));
        assert!(matches!(lexer.next_token(), Some((Token::Partial, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "g"));
        assert!(matches!(lexer.next_token(), Some((Token::Sqrt, _))));
    }

    #[test]
    fn test_additional_morphemes() {
        // Extended Greek morphemes beyond the core pipeline set.
        let mut lexer = Lexer::new("δ ε ω α ζ");
        assert!(matches!(lexer.next_token(), Some((Token::Delta, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Epsilon, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Omega, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Alpha, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Zeta, _))));
    }

    #[test]
    fn test_ffi_keywords() {
        let mut lexer = Lexer::new("extern unsafe");
        assert!(matches!(lexer.next_token(), Some((Token::Extern, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Unsafe, _))));
    }

    #[test]
    fn test_parallel_morphemes() {
        // Both the glyph and the keyword spelling produce the same token.
        let mut lexer = Lexer::new("∥ parallel ⊛ gpu");
        assert!(matches!(lexer.next_token(), Some((Token::Parallel, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Parallel, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Gpu, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Gpu, _))));
    }

    #[test]
    fn test_lifetime_labels() {
        // Test loop labels
        let mut lexer = Lexer::new("'outer: loop { break 'outer }");
        assert!(matches!(lexer.next_token(), Some((Token::Lifetime(s), _)) if s == "outer"));
        assert!(matches!(lexer.next_token(), Some((Token::Colon, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Loop, _))));
        assert!(matches!(lexer.next_token(), Some((Token::LBrace, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Break, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Lifetime(s), _)) if s == "outer"));
        assert!(matches!(lexer.next_token(), Some((Token::RBrace, _))));
    }

    // ==================== STRING LITERAL TESTS ====================

    #[test]
    fn test_string_escape_sequences() {
        // Test basic escape sequences
        let mut lexer = Lexer::new(r#""hello\nworld""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "hello\nworld"),
            other => panic!("Expected StringLit, got {:?}", other),
        }

        // Test tab escape
        let mut lexer = Lexer::new(r#""hello\tworld""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "hello\tworld"),
            other => panic!("Expected StringLit, got {:?}", other),
        }

        // Test carriage return
        let mut lexer = Lexer::new(r#""hello\rworld""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "hello\rworld"),
            other => panic!("Expected StringLit, got {:?}", other),
        }

        // Test escaped backslash
        let mut lexer = Lexer::new(r#""hello\\world""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "hello\\world"),
            other => panic!("Expected StringLit, got {:?}", other),
        }

        // Test escaped quote
        let mut lexer = Lexer::new(r#""hello\"world""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "hello\"world"),
            other => panic!("Expected StringLit, got {:?}", other),
        }

        // Test null character
        let mut lexer = Lexer::new(r#""hello\0world""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "hello\0world"),
            other => panic!("Expected StringLit, got {:?}", other),
        }
    }

    #[test]
    fn test_string_hex_escape() {
        // Test \xNN hex escape (0x41 = 'A')
        let mut lexer = Lexer::new(r#""hello\x41world""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "helloAworld"),
            other => panic!("Expected StringLit, got {:?}", other),
        }
    }

    #[test]
    fn test_string_unicode_escape() {
        // Test \u{NNNN} Unicode escape (U+1F600 = 😀, outside the BMP)
        let mut lexer = Lexer::new(r#""hello\u{1F600}world""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "hello😀world"),
            other => panic!("Expected StringLit, got {:?}", other),
        }

        // Test Greek letter
        let mut lexer = Lexer::new(r#""\u{03C4}""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "τ"),
            other => panic!("Expected StringLit, got {:?}", other),
        }
    }

    #[test]
    fn test_char_escape_sequences() {
        // Escaped char literals must not be confused with lifetimes.
        let mut lexer = Lexer::new(r"'\n'");
        match lexer.next_token() {
            Some((Token::CharLit(c), _)) => assert_eq!(c, '\n'),
            other => panic!("Expected CharLit, got {:?}", other),
        }

        let mut lexer = Lexer::new(r"'\t'");
        match lexer.next_token() {
            Some((Token::CharLit(c), _)) => assert_eq!(c, '\t'),
            other => panic!("Expected CharLit, got {:?}", other),
        }

        let mut lexer = Lexer::new(r"'\\'");
        match lexer.next_token() {
            Some((Token::CharLit(c), _)) => assert_eq!(c, '\\'),
            other => panic!("Expected CharLit, got {:?}", other),
        }
    }

    #[test]
    fn test_raw_string() {
        // Raw strings should NOT process escapes
        let mut lexer = Lexer::new(r#"r"hello\nworld""#);
        match lexer.next_token() {
            Some((Token::RawStringLit(s), _)) => assert_eq!(s, r"hello\nworld"),
            other => panic!("Expected RawStringLit, got {:?}", other),
        }
    }

    #[test]
    fn test_raw_string_delimited() {
        // r#"..."# style: internal double quotes are preserved verbatim
        let mut lexer = Lexer::new(r##"r#"hello "world""#"##);
        match lexer.next_token() {
            Some((Token::RawStringDelimited(s), _)) => assert_eq!(s, r#"hello "world""#),
            other => panic!("Expected RawStringDelimited, got {:?}", other),
        }
    }

    #[test]
    fn test_byte_string() {
        // b"..." yields raw bytes rather than a String payload.
        let mut lexer = Lexer::new(r#"b"hello""#);
        match lexer.next_token() {
            Some((Token::ByteStringLit(bytes), _)) => {
                assert_eq!(bytes, vec![104, 101, 108, 108, 111]); // "hello" in ASCII
            }
            other => panic!("Expected ByteStringLit, got {:?}", other),
        }
    }

    #[test]
    fn test_interpolated_string() {
        // f"..." keeps the {name} placeholder for the parser to expand.
        let mut lexer = Lexer::new(r#"f"hello {name}""#);
        match lexer.next_token() {
            Some((Token::InterpolatedStringLit(s), _)) => assert_eq!(s, "hello {name}"),
            other => panic!("Expected InterpolatedStringLit, got {:?}", other),
        }
    }

    #[test]
    fn test_sigil_string_sql() {
        // σ"..." is a SQL sigil string; payload excludes the sigil and quotes.
        let mut lexer = Lexer::new(r#"σ"SELECT * FROM {table}""#);
        match lexer.next_token() {
            Some((Token::SigilStringSql(s), _)) => assert_eq!(s, "SELECT * FROM {table}"),
            other => panic!("Expected SigilStringSql, got {:?}", other),
        }
    }

    #[test]
    fn test_sigil_string_route() {
        // ρ"..." is a route sigil string with {param} segments preserved.
        let mut lexer = Lexer::new(r#"ρ"/api/v1/{resource}/{id}""#);
        match lexer.next_token() {
            Some((Token::SigilStringRoute(s), _)) => assert_eq!(s, "/api/v1/{resource}/{id}"),
            other => panic!("Expected SigilStringRoute, got {:?}", other),
        }
    }

    #[test]
    fn test_unicode_in_strings() {
        // Test direct Unicode in strings (Greek, CJK, emoji)
        let mut lexer = Lexer::new(r#""τφσρ 你好 🦀""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "τφσρ 你好 🦀"),
            other => panic!("Expected StringLit, got {:?}", other),
        }
    }

    #[test]
    fn test_empty_string() {
        let mut lexer = Lexer::new(r#""""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, ""),
            other => panic!("Expected empty StringLit, got {:?}", other),
        }
    }

    #[test]
    fn test_escape_sequence_helper() {
        // Unit test the helper function directly
        assert_eq!(process_escape_sequences(r"hello\nworld"), "hello\nworld");
        assert_eq!(process_escape_sequences(r"hello\tworld"), "hello\tworld");
        assert_eq!(process_escape_sequences(r"hello\\world"), "hello\\world");
        assert_eq!(process_escape_sequences(r#"hello\"world"#), "hello\"world");
        assert_eq!(process_escape_sequences(r"hello\x41world"), "helloAworld");
        assert_eq!(
            process_escape_sequences(r"hello\u{1F600}world"),
            "hello😀world"
        );
    }
}