sigil_parser/
lexer.rs

//! Lexer for the Sigil programming language.
//!
//! Handles polysynthetic morphemes, evidentiality markers, and multi-base numerals.

5use crate::span::Span;
6use logos::Logos;
7
/// Decode escape sequences in a string literal body.
///
/// Supported escapes: `\n`, `\t`, `\r`, `\\`, `\"`, `\'`, `\0`, `\xNN`
/// (up to two hex digits), and `\u{N...}`. A backslash followed by a
/// newline is a line continuation: the newline and any run of spaces or
/// tabs after it are removed entirely. Unrecognized escapes are kept
/// verbatim, backslash included; a trailing lone backslash is kept too.
fn process_escape_sequences(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    let mut it = s.chars().peekable();

    while let Some(ch) = it.next() {
        if ch != '\\' {
            out.push(ch);
            continue;
        }
        match it.next() {
            // Backslash at end of input: keep it literally.
            None => out.push('\\'),
            // Line continuation: drop the newline plus following indentation.
            Some('\n') => {
                while it.next_if(|&c| c == ' ' || c == '\t').is_some() {}
            }
            Some('n') => out.push('\n'),
            Some('t') => out.push('\t'),
            Some('r') => out.push('\r'),
            Some('\\') => out.push('\\'),
            Some('"') => out.push('"'),
            Some('\'') => out.push('\''),
            Some('0') => out.push('\0'),
            Some('x') => {
                // \xNN: take at most two hex digits; if none parse, the
                // whole escape is silently dropped.
                let mut digits = String::new();
                while digits.len() < 2 {
                    match it.next_if(char::is_ascii_hexdigit) {
                        Some(d) => digits.push(d),
                        None => break,
                    }
                }
                if let Ok(byte) = u8::from_str_radix(&digits, 16) {
                    out.push(byte as char);
                }
            }
            Some('u') => {
                // \u{N...}: only recognized when a '{' follows the 'u'.
                if it.next_if_eq(&'{').is_some() {
                    let mut digits = String::new();
                    loop {
                        if it.next_if_eq(&'}').is_some() {
                            break;
                        }
                        match it.next_if(char::is_ascii_hexdigit) {
                            Some(d) => digits.push(d),
                            // Malformed: stop without consuming the offender.
                            None => break,
                        }
                    }
                    if let Some(decoded) =
                        u32::from_str_radix(&digits, 16).ok().and_then(char::from_u32)
                    {
                        out.push(decoded);
                    }
                }
            }
            // Anything else: not an escape we know — emit unchanged.
            Some(other) => {
                out.push('\\');
                out.push(other);
            }
        }
    }
    out
}
85
/// Decode escape sequences in a byte-string literal body, producing raw bytes.
///
/// Same escapes as `process_escape_sequences` minus `\u{...}` (byte strings
/// hold bytes, not Unicode scalars). Non-ASCII characters are silently
/// dropped, mirroring Rust's rule that byte strings are ASCII-only.
fn process_byte_escape_sequences(s: &str) -> Vec<u8> {
    let mut out = Vec::with_capacity(s.len());
    let mut it = s.chars().peekable();

    while let Some(ch) = it.next() {
        if ch != '\\' {
            if ch.is_ascii() {
                out.push(ch as u8);
            }
            continue;
        }
        match it.next() {
            // Backslash at end of input: keep it literally.
            None => out.push(b'\\'),
            // Line continuation: drop the newline plus following indentation.
            Some('\n') => {
                while it.next_if(|&c| c == ' ' || c == '\t').is_some() {}
            }
            Some('n') => out.push(b'\n'),
            Some('t') => out.push(b'\t'),
            Some('r') => out.push(b'\r'),
            Some('\\') => out.push(b'\\'),
            Some('"') => out.push(b'"'),
            Some('\'') => out.push(b'\''),
            Some('0') => out.push(0),
            Some('x') => {
                // \xNN: take at most two hex digits; if none parse, the
                // whole escape is silently dropped.
                let mut digits = String::new();
                while digits.len() < 2 {
                    match it.next_if(char::is_ascii_hexdigit) {
                        Some(d) => digits.push(d),
                        None => break,
                    }
                }
                if let Ok(byte) = u8::from_str_radix(&digits, 16) {
                    out.push(byte);
                }
            }
            // Unknown escape: keep the backslash; keep the char only if
            // it fits in a byte (i.e. is ASCII).
            Some(other) => {
                out.push(b'\\');
                if other.is_ascii() {
                    out.push(other as u8);
                }
            }
        }
    }
    out
}
141
142/// Callback for delimited raw strings (r#"..."#).
143/// Reads until the closing "# is found.
144/// Callback for block comments: /* ... */
145/// Consumes characters until */ is found
146fn block_comment_callback(lex: &mut logos::Lexer<'_, Token>) -> Option<String> {
147    let remainder = lex.remainder();
148
149    // Find the closing */
150    if let Some(end_pos) = remainder.find("*/") {
151        let content = &remainder[..end_pos];
152        // Bump past content and closing */ (2 chars)
153        lex.bump(end_pos + 2);
154        Some(content.to_string())
155    } else {
156        // No closing */ found - consume rest as comment
157        let len = remainder.len();
158        lex.bump(len);
159        Some(remainder.to_string())
160    }
161}
162
163fn raw_string_delimited_callback(lex: &mut logos::Lexer<'_, Token>) -> Option<String> {
164    let remainder = lex.remainder();
165
166    // Find the closing "#
167    if let Some(end_pos) = remainder.find("\"#") {
168        let content = &remainder[..end_pos];
169        // Bump past content and closing "# (2 chars)
170        lex.bump(end_pos + 2);
171        Some(content.to_string())
172    } else {
173        None
174    }
175}
176
177/// Callback for multi-line string literals.
178/// Reads from """ until the next """ is found.
179fn multiline_string_callback(lex: &mut logos::Lexer<'_, Token>) -> Option<String> {
180    let remainder = lex.remainder();
181
182    // Find the closing """
183    if let Some(end_pos) = remainder.find("\"\"\"") {
184        let content = &remainder[..end_pos];
185        // Bump the lexer past the content and closing quotes
186        lex.bump(end_pos + 3);
187        Some(process_escape_sequences(content))
188    } else {
189        // No closing """ found - skip to end and return what we have
190        None
191    }
192}
193
/// Decode the body of a character literal (the text between the quotes)
/// into a `char`. Malformed or empty input decodes to `'?'`.
fn process_char_escape(s: &str) -> char {
    let mut it = s.chars();
    let first = match it.next() {
        Some(c) => c,
        None => return '?',
    };
    if first != '\\' {
        // Plain single character, no escape.
        return first;
    }
    match it.next() {
        Some('n') => '\n',
        Some('t') => '\t',
        Some('r') => '\r',
        Some('\\') => '\\',
        Some('"') => '"',
        Some('\'') => '\'',
        Some('0') => '\0',
        Some('x') => {
            // \xNN: two hex digits, decoded as a code point in 0..=0xFF.
            let digits: String = it.take(2).collect();
            match u8::from_str_radix(&digits, 16) {
                Ok(v) => v as char,
                Err(_) => '?',
            }
        }
        Some('u') => {
            // \u{N...}: hex digits up to the closing brace.
            if it.next() != Some('{') {
                return '?';
            }
            let digits: String = it.take_while(|&c| c != '}').collect();
            u32::from_str_radix(&digits, 16)
                .ok()
                .and_then(char::from_u32)
                .unwrap_or('?')
        }
        // Unknown escape: yield the escaped character itself.
        Some(c) => c,
        None => '?',
    }
}
230
/// Decode the body of a byte-character literal (`b'x'`) into a `u8`.
/// Malformed or empty input decodes to `b'?'`.
fn process_byte_char_escape(s: &str) -> u8 {
    let mut it = s.chars();
    let first = match it.next() {
        Some(c) => c,
        None => return b'?',
    };
    if first != '\\' {
        // Plain character; non-ASCII is truncated to its low byte,
        // matching the previous behavior.
        return first as u8;
    }
    match it.next() {
        Some('n') => b'\n',
        Some('t') => b'\t',
        Some('r') => b'\r',
        Some('\\') => b'\\',
        Some('"') => b'"',
        Some('\'') => b'\'',
        Some('0') => b'\0',
        Some('x') => {
            // \xNN: two hex digits.
            let digits: String = it.take(2).collect();
            u8::from_str_radix(&digits, 16).unwrap_or(b'?')
        }
        // Unknown escape: yield the escaped character's low byte.
        Some(c) => c as u8,
        None => b'?',
    }
}
254
/// Token types for Sigil.
///
/// NOTE(review): logos resolves overlapping patterns by computed priority,
/// not declaration order. Several regexes here overlap (`//...` line
/// comments vs `//!...` doc comments vs the `//@ rune:` annotation);
/// confirm the intended winners with lexer tests.
#[derive(Logos, Debug, Clone, PartialEq)]
#[logos(skip r"[ \t\r\n\f]+")]
pub enum Token {
    // === Comments ===
    #[regex(r"//[^\n]*", |lex| lex.slice().to_string())]
    LineComment(String),

    // NOTE(review): every `//!...` input also matches the LineComment regex
    // above; verify logos prefers this variant for doc comments.
    #[regex(r"//![^\n]*", |lex| lex.slice().to_string())]
    DocComment(String),

    // Tilde comment style: ~~ ... ~~
    #[regex(r"~~[^\n]*", |lex| lex.slice().to_string())]
    TildeComment(String),

    // Block comment: /* ... */ (non-nested)
    #[token("/*", block_comment_callback)]
    BlockComment(String),

    // === Keywords ===
    #[token("fn")]
    Fn,
    #[token("async")]
    Async,
    #[token("let")]
    Let,
    #[token("mut")]
    Mut,
    #[token("const")]
    Const,
    #[token("type")]
    Type,
    #[token("struct")]
    #[token("sigil")] // Alternative syntax for struct
    Struct,
    #[token("enum")]
    Enum,
    #[token("trait")]
    Trait,
    #[token("impl")]
    Impl,
    #[token("mod")]
    #[token("scroll")] // Sigil-native: scroll = mod
    Mod,
    #[token("use")]
    #[token("invoke")] // Sigil-native: invoke = use
    Use,
    #[token("pub")]
    Pub,
    #[token("actor")]
    Actor,
    #[token("saga")]
    Saga,
    #[token("scope")]
    Scope,
    #[token("rune")]
    Rune,
    #[token("macro")]
    Macro,
    #[token("macro_rules")]
    MacroRules,

    // Control flow
    #[token("if")]
    If,
    #[token("else")]
    Else,
    #[token("match")]
    Match,
    #[token("loop")]
    Loop,
    #[token("while")]
    While,
    #[token("for")]
    For,
    #[token("in")]
    In,
    #[token("break")]
    Break,
    #[token("continue")]
    Continue,
    #[token("return")]
    Return,
    #[token("yield")]
    Yield,
    #[token("await")]
    Await,

    // Other keywords
    #[token("self")]
    SelfLower,
    #[token("Self")]
    SelfUpper,
    #[token("super")]
    Super,
    #[token("crate")]
    #[token("tome")] // Sigil-native: tome = crate
    Crate,
    #[token("where")]
    Where,
    #[token("as")]
    As,
    #[token("dyn")]
    Dyn,
    #[token("move")]
    Move,
    #[token("ref")]
    Ref,
    #[token("static")]
    Static,
    #[token("unsafe")]
    Unsafe,
    #[token("extern")]
    Extern,
    #[token("asm")]
    Asm,
    #[token("volatile")]
    Volatile,
    #[token("naked")]
    Naked,
    #[token("packed")]
    Packed,
    #[token("simd")]
    Simd,
    #[token("atomic")]
    Atomic,
    #[token("derive")]
    Derive,
    #[token("on")]
    On,

    // Plurality keywords (DAEMONIORUM extensions)
    #[token("alter")]
    Alter,
    #[token("switch")]
    Switch,
    #[token("headspace")]
    Headspace,
    #[token("cocon")]
    CoCon,
    #[token("reality")]
    Reality,
    #[token("split")]
    Split,
    #[token("trigger")]
    Trigger,
    #[token("layer")]
    Layer,
    #[token("location")]
    Location,
    #[token("states")]
    States,
    #[token("anima")]
    Anima,
    #[token("to")]
    To,
    #[token("from")]
    From,

    // Alter-source markers (compound tokens)
    #[token("@!")]
    AlterSourceFronting,
    #[token("@~")]
    AlterSourceCoCon,
    #[token("@?")]
    AlterSourceDormant,
    #[token("@‽")]
    AlterSourceBlended,

    // Boolean literals
    #[token("true")]
    True,
    #[token("false")]
    False,

    // Null literal
    #[token("null")]
    Null,

    // === Morphemes (Greek letters) ===
    // NOTE(review): single Greek letters lex as morpheme tokens, while the
    // Ident regex below also admits Greek letters inside longer names —
    // confirm priority makes e.g. a lone `τ` a Tau and `τx` an Ident.
    #[token("τ")]
    #[token("Τ")]
    Tau, // Transform/map

    #[token("φ")]
    #[token("Φ")]
    Phi, // Filter

    #[token("σ")]
    #[token("Σ")]
    Sigma, // Sort (lowercase) / Sum (uppercase)

    #[token("ρ")]
    #[token("Ρ")]
    Rho, // Reduce

    #[token("λ")]
    #[token("Λ")]
    Lambda, // Lambda

    #[token("Π")]
    Pi, // Product

    #[token("⌛")]
    Hourglass, // Await symbol

    // Additional morphemes
    #[token("δ")]
    #[token("Δ")]
    Delta, // Difference/change

    #[token("ε")]
    Epsilon, // Empty/null

    #[token("ω")]
    #[token("Ω")]
    Omega, // End/terminal

    #[token("α")]
    Alpha, // First element

    #[token("ζ")]
    Zeta, // Zip/combine

    // === Additional Access Morphemes ===
    #[token("μ")]
    #[token("Μ")]
    Mu, // Middle/median element

    #[token("χ")]
    #[token("Χ")]
    Chi, // Random/choice (from chaos)

    #[token("ν")]
    #[token("Ν")]
    Nu, // Nth element (ordinal)

    #[token("ξ")]
    #[token("Ξ")]
    Xi, // Next in sequence

    #[token("ψ")]
    #[token("Ψ")]
    Psi, // Psychological/mental state

    #[token("θ")]
    #[token("Θ")]
    Theta, // Threshold/angle

    #[token("κ")]
    #[token("Κ")]
    Kappa, // Callback/continuation

    // === Parallel/Concurrency Morphemes ===
    #[token("∥")]
    #[token("parallel")]
    Parallel, // Parallel execution (U+2225)

    #[token("⊛")]
    #[token("gpu")]
    Gpu, // GPU compute shader (U+229B - circled asterisk)

    // === Quantifiers (for AI-native set operations) ===
    #[token("∀")]
    ForAll, // Universal quantification

    #[token("∃")]
    Exists, // Existential quantification

    #[token("∈")]
    ElementOf, // Membership test

    #[token("∉")]
    NotElementOf, // Non-membership

    // === Set Operations ===
    #[token("∪")]
    Union, // Set union

    #[token("∩")]
    Intersection, // Set intersection

    #[token("∖")]
    SetMinus, // Set difference

    #[token("⊂")]
    Subset, // Proper subset

    #[token("⊆")]
    SubsetEq, // Subset or equal

    #[token("⊃")]
    Superset, // Proper superset

    #[token("⊇")]
    SupersetEq, // Superset or equal

    // === Logic Operators ===
    #[token("∧")]
    LogicAnd, // Logical conjunction

    #[token("∨")]
    LogicOr, // Logical disjunction

    #[token("¬")]
    LogicNot, // Logical negation

    #[token("⊻")]
    LogicXor, // Exclusive or

    #[token("⊤")]
    Top, // True/any type

    #[token("⊥")]
    Bottom, // False/never type

    // === Bitwise Operators (Unicode) ===
    #[token("⋏")]
    BitwiseAndSymbol, // Bitwise AND (U+22CF)

    #[token("⋎")]
    BitwiseOrSymbol, // Bitwise OR (U+22CE)

    #[token("⊙")]
    CircledDot, // Hadamard product / element-wise multiply (U+2299)

    // Note: ⊗ (tensor product) is already defined as Token::Tensor below

    // === Type Theory ===
    #[token("∷")]
    TypeAnnotation, // Type annotation (alternative to :)

    // === Analysis/Calculus ===
    #[token("∫")]
    Integral, // Cumulative sum

    #[token("∂")]
    Partial, // Discrete derivative

    #[token("√")]
    Sqrt, // Square root

    #[token("∛")]
    Cbrt, // Cube root

    #[token("∇")]
    Nabla, // Gradient (U+2207)

    // === APL-Inspired Symbols ===
    #[token("⍋")]
    GradeUp, // Sort ascending (U+234B)

    #[token("⍒")]
    GradeDown, // Sort descending (U+2352)

    #[token("⌽")]
    Rotate, // Reverse/rotate (U+233D)

    #[token("↻")]
    CycleArrow, // Cycle/repeat (U+21BB)

    #[token("⌺")]
    QuadDiamond, // Windows/stencil (U+233A)

    #[token("⊞")]
    SquaredPlus, // Chunks (U+229E)

    #[token("⍳")]
    Iota, // Enumerate/index (U+2373)

    // === Category Theory ===
    #[token("∘")]
    Compose, // Function composition

    #[token("⊗")]
    Tensor, // Tensor product

    #[token("⊕")]
    DirectSum, // Direct sum / XOR

    // === Data Operations ===
    #[token("⋈")]
    Bowtie, // Join/zip combining (U+22C8)

    #[token("⋳")]
    ElementSmallVerticalBar, // Flatten (U+22F3)

    #[token("⊔")]
    SquareCup, // Lattice join / supremum (U+2294)

    #[token("⊓")]
    SquareCap, // Lattice meet / infimum (U+2293)

    // === Evidentiality Markers ===
    // Note: These are handled contextually since ! and ? have other uses
    #[token("‽")]
    Interrobang, // Paradox/trust boundary (U+203D)

    #[token("◊")]
    Lozenge, // Predicted/speculative (U+25CA) - Token◊

    // === Legion Morphemes (Holographic Agent Collective) ===
    // From Infernum 2.0 - distributed memory and multi-agent coordination
    #[token("∿")]
    #[token("legion_field")]
    LegionField, // Collective memory substrate (U+223F sine wave) - memory∿

    #[token("⫰")]
    #[token("interfere")]
    Interfere, // Interference query (U+2AF0) - query ⫰ field∿

    #[token("⟁")]
    #[token("distribute")]
    Distribute, // Holographic distribution (U+27C1) - task ⟁ 8

    #[token("⟀")]
    #[token("gather")]
    Gather, // Interference gathering (U+27C0) - fragments ⟀

    #[token("↠")]
    #[token("broadcast")]
    Broadcast, // One-to-many broadcast (U+21A0) - signal ↠ legion

    #[token("⇢")]
    #[token("consensus")]
    Consensus, // Many-to-one consensus (U+21E2) - contributions ⇢

    // Compound Legion operators
    #[token("⊕=")]
    DirectSumEq, // Superposition assign - field∿ ⊕= pattern

    #[token("∂=")]
    PartialEq_, // Decay assign - field∿ ∂= 0.95 (renamed to avoid std conflict)

    #[token("⫰=")]
    InterfereEq, // Interference assign

    // === Affective Markers (Sentiment & Emotion) ===
    // Sentiment polarity
    #[token("⊖")]
    AffectNegative, // Negative sentiment (U+2296 Circled Minus)

    #[token("⊜")]
    AffectNeutral, // Neutral sentiment (U+229C Circled Equals)

    // Note: ⊕ (U+2295) is already DirectSum - we'll use it dual-purpose for positive sentiment

    // Sarcasm/Irony
    #[token("⸮")]
    IronyMark, // Irony/sarcasm marker (U+2E2E - historical percontation point!)

    // Intensity modifiers
    #[token("↑")]
    IntensityUp, // Intensifier (U+2191)

    #[token("↓")]
    IntensityDown, // Dampener (U+2193)

    #[token("⇈")]
    IntensityMax, // Maximum intensity (U+21C8)

    // Formality register
    #[token("♔")]
    FormalRegister, // Formal (U+2654 White King)

    #[token("♟")]
    InformalRegister, // Informal (U+265F Black Pawn)

    // Emotion markers (Plutchik's wheel)
    #[token("☺")]
    EmotionJoy, // Joy (U+263A)

    #[token("☹")]
    EmotionSadness, // Sadness (U+2639)

    #[token("⚡")]
    EmotionAnger, // Anger (U+26A1)

    #[token("❄")]
    EmotionFear, // Fear (U+2744)

    #[token("✦")]
    EmotionSurprise, // Surprise (U+2726)

    #[token("♡")]
    EmotionLove, // Love/Trust (U+2661)

    // Confidence markers
    #[token("◉")]
    ConfidenceHigh, // High confidence (U+25C9)

    #[token("◎")]
    ConfidenceMedium, // Medium confidence (U+25CE)

    #[token("○")]
    ConfidenceLow, // Low confidence (U+25CB)

    // === Aspect Morphemes (verb aspects) ===
    #[token("·ing")]
    AspectProgressive, // Ongoing/streaming aspect

    #[token("·ed")]
    AspectPerfective, // Completed aspect

    #[token("·able")]
    AspectPotential, // Capability aspect

    #[token("·ive")]
    AspectResultative, // Result-producing aspect

    // === Operators ===
    #[token("|")]
    Pipe,
    #[token("·")]
    MiddleDot, // Incorporation
    #[token("->")]
    Arrow,
    #[token("=>")]
    FatArrow,
    #[token("<-")]
    LeftArrow,
    #[token("==")]
    EqEq,
    #[token("!=")]
    NotEq,
    #[token("<=")]
    LtEq,
    #[token(">=")]
    GtEq,
    #[token("<")]
    Lt,
    #[token(">")]
    Gt,
    #[token("+")]
    Plus,
    #[token("-")]
    Minus,
    #[token("*")]
    Star,
    #[token("/")]
    Slash,
    #[token("%")]
    Percent,
    #[token("**")]
    StarStar, // Exponentiation
    #[token("&&")]
    AndAnd,
    #[token("||")]
    OrOr,
    #[token("!")]
    Bang, // Evidentiality: known / logical not
    #[token("?")]
    Question, // Evidentiality: uncertain / try
    #[token("~")]
    Tilde, // Evidentiality: reported
    #[token("&")]
    Amp,
    #[token("^")]
    Caret,
    #[token("<<=")]
    ShlEq,
    #[token(">>=")]
    ShrEq,
    #[token("<<")]
    Shl,
    #[token(">>")]
    Shr,
    #[token("=")]
    Eq,
    #[token("+=")]
    PlusEq,
    #[token("-=")]
    MinusEq,
    #[token("*=")]
    StarEq,
    #[token("/=")]
    SlashEq,
    #[token("%=")]
    PercentEq,
    #[token("|=")]
    PipeEq,
    #[token("&=")]
    AmpEq,
    #[token("^=")]
    CaretEq,
    #[token("..")]
    DotDot,
    #[token("..=")]
    DotDotEq,
    #[token("++")]
    PlusPlus, // Concatenation
    #[token("::")]
    ColonColon,
    #[token(":")]
    Colon,
    #[token(";")]
    Semi,
    #[token(",")]
    Comma,
    #[token(".")]
    Dot,
    #[token("@")]
    At,
    #[token("#!")]
    HashBang, // Inner attribute prefix #![...]
    #[token("#")]
    Hash,
    // Explicit priority so a lone `_` wins over the Ident regex below.
    #[token("_", priority = 3)]
    Underscore,

    // === Delimiters ===
    #[token("(")]
    LParen,
    #[token(")")]
    RParen,
    #[token("{")]
    LBrace,
    #[token("}")]
    RBrace,
    #[token("[")]
    LBracket,
    #[token("]")]
    RBracket,

    // === Special symbols ===
    #[token("∅")]
    Empty, // Void/emptiness (śūnya)
    #[token("◯")]
    Circle, // Geometric zero
    #[token("∞")]
    Infinity, // Ananta

    // === Protocol Operations (Sigil-native networking) ===
    #[token("⇒")]
    ProtoSend, // Send data (U+21D2 - rightwards double arrow)

    #[token("⇐")]
    ProtoRecv, // Receive data (U+21D0 - leftwards double arrow)

    #[token("≋")]
    ProtoStream, // Stream data (U+224B - triple tilde)

    #[token("⊸")]
    ProtoConnect, // Connect/lollipop (U+22B8 - multimap)

    #[token("⏱")]
    ProtoTimeout, // Timeout (U+23F1 - stopwatch)

    // Note: ⊗ (Tensor) is used for close in protocol contexts

    // Protocol keywords for ASCII fallback
    #[token("send")]
    Send,
    #[token("recv")]
    Recv,
    #[token("stream")]
    Stream,
    #[token("connect")]
    Connect,
    #[token("close")]
    Close,
    #[token("timeout")]
    Timeout,
    #[token("retry")]
    Retry,
    #[token("header")]
    Header,
    #[token("body")]
    Body,

    // Protocol type identifiers (for incorporation: http·, ws·, grpc·, kafka·)
    #[token("http")]
    Http,
    #[token("https")]
    Https,
    #[token("ws")]
    Ws,
    #[token("wss")]
    Wss,
    #[token("grpc")]
    Grpc,
    #[token("kafka")]
    Kafka,
    #[token("amqp")]
    Amqp,
    #[token("graphql")]
    GraphQL,

    // === Numbers ===
    // Binary: 0b... with optional type suffix
    #[regex(r"0b[01_]+(i8|i16|i32|i64|i128|isize|u8|u16|u32|u64|u128|usize)?", |lex| lex.slice().to_string())]
    BinaryLit(String),

    // Octal: 0o... with optional type suffix
    #[regex(r"0o[0-7_]+(i8|i16|i32|i64|i128|isize|u8|u16|u32|u64|u128|usize)?", |lex| lex.slice().to_string())]
    OctalLit(String),

    // Hex: 0x... with optional type suffix
    #[regex(r"0x[0-9a-fA-F_]+(i8|i16|i32|i64|i128|isize|u8|u16|u32|u64|u128|usize)?", |lex| lex.slice().to_string())]
    HexLit(String),

    // Vigesimal: 0v... (base 20)
    #[regex(r"0v[0-9a-jA-J_]+", |lex| lex.slice().to_string())]
    VigesimalLit(String),

    // Sexagesimal: 0s... (base 60)
    #[regex(r"0s[0-9a-zA-Z_]+", |lex| lex.slice().to_string())]
    SexagesimalLit(String),

    // Duodecimal: 0z... (base 12)
    #[regex(r"0z[0-9a-bA-B_]+", |lex| lex.slice().to_string())]
    DuodecimalLit(String),

    // Float: 123.456 or 1.23e10 or 1e-15 (with or without decimal point if exponent present)
    // Optional type suffix: f16, f32, f64, f128
    #[regex(r"([0-9][0-9_]*\.[0-9][0-9_]*([eE][+-]?[0-9_]+)?|[0-9][0-9_]*[eE][+-]?[0-9_]+)(f16|f32|f64|f128)?", |lex| lex.slice().to_string())]
    FloatLit(String),

    // Integer: 123 with optional type suffix (i8, i16, i32, i64, i128, isize, u8, u16, u32, u64, u128, usize)
    #[regex(r"[0-9][0-9_]*(i8|i16|i32|i64|i128|isize|u8|u16|u32|u64|u128|usize)?", |lex| lex.slice().to_string())]
    IntLit(String),

    // === Strings ===
    // Regular string with escape sequence processing
    // Note: \\(.|\n) handles both regular escapes and line continuation (\ at end of line)
    #[regex(r#""([^"\\]|\\(.|\n))*""#, |lex| {
        let s = lex.slice();
        let inner = &s[1..s.len()-1];
        process_escape_sequences(inner)
    })]
    StringLit(String),

    // Multi-line string (triple-quoted) - handled via callback
    #[token(r#"""""#, multiline_string_callback)]
    MultiLineStringLit(String),

    // Byte string literal
    #[regex(r#"b"([^"\\]|\\.)*""#, |lex| {
        let s = lex.slice();
        let inner = &s[2..s.len()-1];
        process_byte_escape_sequences(inner)
    })]
    ByteStringLit(Vec<u8>),

    // Interpolated string (will be parsed further for expressions)
    #[regex(r#"f"([^"\\]|\\.)*""#, |lex| {
        let s = lex.slice();
        let inner = &s[2..s.len()-1];
        process_escape_sequences(inner)
    })]
    InterpolatedStringLit(String),

    // Sigil string - SQL template (σ prefix)
    #[regex(r#"σ"([^"\\]|\\.)*""#, |lex| {
        let s = lex.slice();
        // Get byte index after the σ character (which is 2 bytes in UTF-8)
        let start = "σ".len() + 1; // σ + opening quote
        let inner = &s[start..s.len()-1];
        process_escape_sequences(inner)
    })]
    SigilStringSql(String),

    // Sigil string - Route template (ρ prefix)
    #[regex(r#"ρ"([^"\\]|\\.)*""#, |lex| {
        let s = lex.slice();
        // Get byte index after the ρ character (which is 2 bytes in UTF-8)
        let start = "ρ".len() + 1; // ρ + opening quote
        let inner = &s[start..s.len()-1];
        process_escape_sequences(inner)
    })]
    SigilStringRoute(String),

    // Char literal with escape sequence processing
    // Matches: single char, hex escape \xNN, unicode escape \u{N...}, or simple escape \c
    #[regex(r"'([^'\\]|\\x[0-9a-fA-F]{2}|\\u\{[0-9a-fA-F]{1,6}\}|\\.)'", |lex| {
        let s = lex.slice();
        let inner = &s[1..s.len()-1];
        process_char_escape(inner)
    })]
    CharLit(char),

    // Byte char literal (b'x' or b'\n')
    #[regex(r"b'([^'\\]|\\x[0-9a-fA-F]{2}|\\.)'", |lex| {
        let s = lex.slice();
        // Extract the character between b' and '
        let inner = &s[2..s.len()-1];
        process_byte_char_escape(inner)
    })]
    ByteCharLit(u8),

    // Raw string (no escape processing, but allows \" for literal quotes in patterns)
    #[regex(r#"r"([^"\\]|\\.)*""#, |lex| {
        let s = lex.slice();
        s[2..s.len()-1].to_string()
    })]
    RawStringLit(String),

    // Raw string with delimiter (r#"..."# style) - handles internal quotes
    #[token(r##"r#""##, raw_string_delimited_callback)]
    RawStringDelimited(String),

    // === Lifetime/Label (for loop labels like 'outer: loop { break 'outer }) ===
    // NOTE(review): this regex overlaps the CharLit regex for inputs like
    // 'a' — confirm the lexer resolves single-quoted forms as intended.
    #[regex(r"'[a-zA-Z_][a-zA-Z0-9_]*", |lex| lex.slice()[1..].to_string())]
    Lifetime(String),

    // === Identifiers ===
    // Includes Greek letters for polysynthetic identifiers like compute_ψ_state
    // Greek letters (both cases): αΑ, βΒ, γΓ, δΔ, εΕ, ζΖ, ηΗ, θΘ, ιΙ, κΚ, λΛ, μΜ, νΝ, ξΞ, οΟ, πΠ, ρΡ, σΣ, τΤ, υΥ, φΦ, χΧ, ψΨ, ωΩ
    #[regex(r"[a-zA-Z_αΑβΒγΓδΔεΕζΖηΗθΘιΙκΚλΛμΜνΝξΞοΟπΠρΡσΣτΤυΥφΦχΧψΨωΩ][a-zA-Z0-9_αΑβΒγΓδΔεΕζΖηΗθΘιΙκΚλΛμΜνΝξΞοΟπΠρΡσΣτΤυΥφΦχΧψΨωΩ]*", |lex| lex.slice().to_string())]
    Ident(String),

    // === Rune annotation ===
    #[regex(r"//@\s*rune:\s*[a-zA-Z_][a-zA-Z0-9_]*", |lex| lex.slice().to_string())]
    RuneAnnotation(String),
}
1070
1071impl Token {
1072    pub fn is_keyword(&self) -> bool {
1073        matches!(
1074            self,
1075            Token::Fn
1076                | Token::Async
1077                | Token::Let
1078                | Token::Mut
1079                | Token::Const
1080                | Token::Type
1081                | Token::Struct
1082                | Token::Enum
1083                | Token::Trait
1084                | Token::Impl
1085                | Token::Mod
1086                | Token::Use
1087                | Token::Pub
1088                | Token::Actor
1089                | Token::Saga
1090                | Token::Scope
1091                | Token::Rune
1092                | Token::If
1093                | Token::Else
1094                | Token::Match
1095                | Token::Loop
1096                | Token::While
1097                | Token::For
1098                | Token::In
1099                | Token::Break
1100                | Token::Continue
1101                | Token::Return
1102                | Token::Yield
1103                | Token::Await
1104        ) || self.is_plurality_keyword()
1105    }
1106
1107    pub fn is_plurality_keyword(&self) -> bool {
1108        matches!(
1109            self,
1110            Token::Alter
1111                | Token::Switch
1112                | Token::Headspace
1113                | Token::CoCon
1114                | Token::Reality
1115                | Token::Split
1116                | Token::Trigger
1117                | Token::Layer
1118                | Token::Location
1119                | Token::States
1120                | Token::Anima
1121                | Token::To
1122                | Token::From
1123        )
1124    }
1125
1126    pub fn is_alter_source(&self) -> bool {
1127        matches!(
1128            self,
1129            Token::AlterSourceFronting
1130                | Token::AlterSourceCoCon
1131                | Token::AlterSourceDormant
1132                | Token::AlterSourceBlended
1133        )
1134    }
1135
1136    pub fn is_morpheme(&self) -> bool {
1137        matches!(
1138            self,
1139            Token::Tau | Token::Phi | Token::Sigma | Token::Rho |
1140            Token::Lambda | Token::Pi | Token::Hourglass |
1141            Token::Delta | Token::Epsilon | Token::Omega | Token::Alpha | Token::Zeta |
1142            Token::Mu | Token::Chi | Token::Nu | Token::Xi |  // Access morphemes
1143            Token::Parallel | Token::Gpu |  // Concurrency morphemes
1144            Token::Integral | Token::Partial | Token::Sqrt | Token::Cbrt |
1145            Token::Compose
1146        )
1147    }
1148
1149    pub fn is_aspect(&self) -> bool {
1150        matches!(
1151            self,
1152            Token::AspectProgressive
1153                | Token::AspectPerfective
1154                | Token::AspectPotential
1155                | Token::AspectResultative
1156        )
1157    }
1158
1159    pub fn is_data_op(&self) -> bool {
1160        matches!(
1161            self,
1162            Token::Bowtie | Token::ElementSmallVerticalBar | Token::SquareCup | Token::SquareCap
1163        )
1164    }
1165
1166    pub fn is_bitwise_symbol(&self) -> bool {
1167        matches!(self, Token::BitwiseAndSymbol | Token::BitwiseOrSymbol)
1168    }
1169
1170    pub fn is_quantifier(&self) -> bool {
1171        matches!(
1172            self,
1173            Token::ForAll | Token::Exists | Token::ElementOf | Token::NotElementOf
1174        )
1175    }
1176
1177    pub fn is_set_op(&self) -> bool {
1178        matches!(
1179            self,
1180            Token::Union
1181                | Token::Intersection
1182                | Token::SetMinus
1183                | Token::Subset
1184                | Token::SubsetEq
1185                | Token::Superset
1186                | Token::SupersetEq
1187        )
1188    }
1189
1190    pub fn is_logic_op(&self) -> bool {
1191        matches!(
1192            self,
1193            Token::LogicAnd
1194                | Token::LogicOr
1195                | Token::LogicNot
1196                | Token::LogicXor
1197                | Token::Top
1198                | Token::Bottom
1199        )
1200    }
1201
1202    pub fn is_evidentiality(&self) -> bool {
1203        matches!(
1204            self,
1205            Token::Bang | Token::Question | Token::Tilde | Token::Interrobang | Token::Lozenge
1206        )
1207    }
1208
1209    pub fn is_legion_morpheme(&self) -> bool {
1210        matches!(
1211            self,
1212            Token::LegionField      // ∿ - collective memory
1213                | Token::DirectSum  // ⊕ - superposition
1214                | Token::Interfere  // ⫰ - interference
1215                | Token::ConfidenceHigh  // ◉ - resonance (dual-purpose)
1216                | Token::Distribute // ⟁ - holographic distribution
1217                | Token::Gather     // ⟀ - interference gathering
1218                | Token::Broadcast  // ↠ - one-to-many
1219                | Token::Consensus  // ⇢ - many-to-one
1220                | Token::Partial // ∂ - decay
1221        )
1222    }
1223
1224    pub fn is_legion_assign(&self) -> bool {
1225        matches!(
1226            self,
1227            Token::DirectSumEq | Token::PartialEq_ | Token::InterfereEq
1228        )
1229    }
1230
1231    pub fn is_affective(&self) -> bool {
1232        matches!(
1233            self,
1234            // Sentiment
1235            Token::DirectSum |  // ⊕ positive (dual-purpose with DirectSum)
1236            Token::AffectNegative |  // ⊖ negative
1237            Token::AffectNeutral |  // ⊜ neutral
1238            // Sarcasm
1239            Token::IronyMark |  // ⸮ irony/sarcasm
1240            // Intensity
1241            Token::IntensityUp |  // ↑
1242            Token::IntensityDown |  // ↓
1243            Token::IntensityMax |  // ⇈
1244            // Formality
1245            Token::FormalRegister |  // ♔
1246            Token::InformalRegister |  // ♟
1247            // Emotions
1248            Token::EmotionJoy |  // ☺
1249            Token::EmotionSadness |  // ☹
1250            Token::EmotionAnger |  // ⚡
1251            Token::EmotionFear |  // ❄
1252            Token::EmotionSurprise |  // ✦
1253            Token::EmotionLove |  // ♡
1254            // Confidence
1255            Token::ConfidenceHigh |  // ◉
1256            Token::ConfidenceMedium |  // ◎
1257            Token::ConfidenceLow // ○
1258        )
1259    }
1260
1261    pub fn is_sentiment(&self) -> bool {
1262        matches!(
1263            self,
1264            Token::DirectSum | Token::AffectNegative | Token::AffectNeutral
1265        )
1266    }
1267
1268    pub fn is_emotion(&self) -> bool {
1269        matches!(
1270            self,
1271            Token::EmotionJoy
1272                | Token::EmotionSadness
1273                | Token::EmotionAnger
1274                | Token::EmotionFear
1275                | Token::EmotionSurprise
1276                | Token::EmotionLove
1277        )
1278    }
1279
1280    pub fn is_intensity(&self) -> bool {
1281        matches!(
1282            self,
1283            Token::IntensityUp | Token::IntensityDown | Token::IntensityMax
1284        )
1285    }
1286}
1287
/// Lexer wrapping Logos for Sigil.
///
/// Adds multi-token lookahead on top of the raw `logos` lexer: peeked
/// tokens are stored in `buffer` so `peek`/`peek_n` never consume input.
pub struct Lexer<'a> {
    // The underlying generated logos lexer over the source text.
    inner: logos::Lexer<'a, Token>,
    /// Buffer for lookahead tokens (supports multi-token peek).
    /// Each slot is `Some((token, span))`, or `None` once end-of-input was
    /// observed at that lookahead position.
    buffer: Vec<Option<(Token, Span)>>,
}
1294
1295impl<'a> Lexer<'a> {
1296    pub fn new(source: &'a str) -> Self {
1297        Self {
1298            inner: Token::lexer(source),
1299            buffer: Vec::new(),
1300        }
1301    }
1302
1303    /// Read the next token from the underlying logos lexer
1304    fn read_next(&mut self) -> Option<(Token, Span)> {
1305        match self.inner.next() {
1306            Some(Ok(token)) => {
1307                let span = self.inner.span();
1308                Some((token, Span::new(span.start, span.end)))
1309            }
1310            Some(Err(_)) => {
1311                // Skip invalid tokens and try next
1312                self.read_next()
1313            }
1314            None => None,
1315        }
1316    }
1317
1318    pub fn next_token(&mut self) -> Option<(Token, Span)> {
1319        if !self.buffer.is_empty() {
1320            // Return from buffer (front = next token)
1321            // Each buffer element is Option<(Token, Span)> where None = EOF
1322            return self.buffer.remove(0);
1323        }
1324        self.read_next()
1325    }
1326
1327    pub fn peek(&mut self) -> Option<&(Token, Span)> {
1328        self.peek_n(0)
1329    }
1330
1331    /// Peek n tokens ahead (0 = next token, 1 = token after that, etc.)
1332    pub fn peek_n(&mut self, n: usize) -> Option<&(Token, Span)> {
1333        // Fill buffer up to position n
1334        while self.buffer.len() <= n {
1335            let token = self.read_next();
1336            self.buffer.push(token);
1337        }
1338        self.buffer.get(n).and_then(|opt| opt.as_ref())
1339    }
1340
1341    pub fn span(&self) -> Span {
1342        let span = self.inner.span();
1343        Span::new(span.start, span.end)
1344    }
1345}
1346
#[cfg(test)]
mod tests {
    use super::*;

    // Each morpheme glyph lexes to its dedicated token; Σ produces the same
    // token as σ (both cases map to Sigma).
    #[test]
    fn test_morphemes() {
        let mut lexer = Lexer::new("τ φ σ ρ λ Σ Π ⌛");
        assert!(matches!(lexer.next_token(), Some((Token::Tau, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Phi, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Sigma, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Rho, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Lambda, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Sigma, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Pi, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Hourglass, _))));
    }

    // Evidentiality markers attach after identifiers and lex as separate
    // tokens: ! ? ~ ‽.
    #[test]
    fn test_evidentiality() {
        let mut lexer = Lexer::new("value! uncertain? reported~ paradox‽");
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "value"));
        assert!(matches!(lexer.next_token(), Some((Token::Bang, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "uncertain"));
        assert!(matches!(lexer.next_token(), Some((Token::Question, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "reported"));
        assert!(matches!(lexer.next_token(), Some((Token::Tilde, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "paradox"));
        assert!(matches!(lexer.next_token(), Some((Token::Interrobang, _))));
    }

    // Pipe chains tokenize as ident | morpheme { ... }.
    // NOTE(review): only the first pipeline stage is asserted; the
    // remaining input (|φ{p}|σ) is left unchecked.
    #[test]
    fn test_pipe_chain() {
        let mut lexer = Lexer::new("data|τ{f}|φ{p}|σ");
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "data"));
        assert!(matches!(lexer.next_token(), Some((Token::Pipe, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Tau, _))));
        assert!(matches!(lexer.next_token(), Some((Token::LBrace, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "f"));
        assert!(matches!(lexer.next_token(), Some((Token::RBrace, _))));
    }

    // Multi-base numeral literals: decimal, binary (0b), octal (0o),
    // hex (0x), vigesimal (0v), sexagesimal (0s), and float.
    // The lexer preserves the literal text including its prefix.
    #[test]
    fn test_numbers() {
        let mut lexer = Lexer::new("42 0b1010 0o52 0x2A 0v22 0s42 3.14");
        assert!(matches!(lexer.next_token(), Some((Token::IntLit(s), _)) if s == "42"));
        assert!(matches!(lexer.next_token(), Some((Token::BinaryLit(s), _)) if s == "0b1010"));
        assert!(matches!(lexer.next_token(), Some((Token::OctalLit(s), _)) if s == "0o52"));
        assert!(matches!(lexer.next_token(), Some((Token::HexLit(s), _)) if s == "0x2A"));
        assert!(matches!(lexer.next_token(), Some((Token::VigesimalLit(s), _)) if s == "0v22"));
        assert!(matches!(lexer.next_token(), Some((Token::SexagesimalLit(s), _)) if s == "0s42"));
        assert!(matches!(lexer.next_token(), Some((Token::FloatLit(s), _)) if s == "3.14"));
    }

    // Noun incorporation uses the middle dot (·) between identifiers.
    #[test]
    fn test_incorporation() {
        let mut lexer = Lexer::new("file·open·read");
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "file"));
        assert!(matches!(lexer.next_token(), Some((Token::MiddleDot, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "open"));
        assert!(matches!(lexer.next_token(), Some((Token::MiddleDot, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "read"));
    }

    // Standalone symbol tokens: empty set, circle, infinity.
    #[test]
    fn test_special_symbols() {
        let mut lexer = Lexer::new("∅ ◯ ∞");
        assert!(matches!(lexer.next_token(), Some((Token::Empty, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Circle, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Infinity, _))));
    }

    // Quantifiers bind tightly to the following identifier; membership
    // operators separate identifiers.
    // NOTE(review): the input ends with y∉T but ElementOf is the last
    // token asserted — NotElementOf is never checked here.
    #[test]
    fn test_quantifiers() {
        let mut lexer = Lexer::new("∀x ∃y x∈S y∉T");
        assert!(matches!(lexer.next_token(), Some((Token::ForAll, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "x"));
        assert!(matches!(lexer.next_token(), Some((Token::Exists, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "y"));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "x"));
        assert!(matches!(lexer.next_token(), Some((Token::ElementOf, _))));
    }

    // Set-algebra operators between identifiers.
    // NOTE(review): only the first two operators (∪ ∩) are asserted.
    #[test]
    fn test_set_operations() {
        let mut lexer = Lexer::new("A∪B A∩B A∖B A⊂B A⊆B");
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "A"));
        assert!(matches!(lexer.next_token(), Some((Token::Union, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "B"));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "A"));
        assert!(matches!(lexer.next_token(), Some((Token::Intersection, _))));
    }

    // Symbolic logic operators between identifiers.
    // NOTE(review): only ∧ and ∨ are asserted; ¬ ⊻ ⊤ ⊥ are unchecked.
    #[test]
    fn test_logic_operators() {
        let mut lexer = Lexer::new("p∧q p∨q ¬p p⊻q ⊤ ⊥");
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "p"));
        assert!(matches!(lexer.next_token(), Some((Token::LogicAnd, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "q"));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "p"));
        assert!(matches!(lexer.next_token(), Some((Token::LogicOr, _))));
    }

    // Analysis operators (∫ ∂ √ ∛ ∘) prefix or join identifiers.
    // NOTE(review): assertions stop at Sqrt; ∛ and ∘ are unchecked.
    #[test]
    fn test_analysis_operators() {
        let mut lexer = Lexer::new("∫f ∂g √x ∛y f∘g");
        assert!(matches!(lexer.next_token(), Some((Token::Integral, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "f"));
        assert!(matches!(lexer.next_token(), Some((Token::Partial, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "g"));
        assert!(matches!(lexer.next_token(), Some((Token::Sqrt, _))));
    }

    // Additional single-glyph morphemes.
    #[test]
    fn test_additional_morphemes() {
        let mut lexer = Lexer::new("δ ε ω α ζ");
        assert!(matches!(lexer.next_token(), Some((Token::Delta, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Epsilon, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Omega, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Alpha, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Zeta, _))));
    }

    // FFI keywords lex as dedicated tokens.
    #[test]
    fn test_ffi_keywords() {
        let mut lexer = Lexer::new("extern unsafe");
        assert!(matches!(lexer.next_token(), Some((Token::Extern, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Unsafe, _))));
    }

    // Both the glyph and the spelled-out word map to the same token:
    // ∥ / "parallel" → Parallel, ⊛ / "gpu" → Gpu.
    #[test]
    fn test_parallel_morphemes() {
        let mut lexer = Lexer::new("∥ parallel ⊛ gpu");
        assert!(matches!(lexer.next_token(), Some((Token::Parallel, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Parallel, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Gpu, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Gpu, _))));
    }

    #[test]
    fn test_lifetime_labels() {
        // Test loop labels
        let mut lexer = Lexer::new("'outer: loop { break 'outer }");
        assert!(matches!(lexer.next_token(), Some((Token::Lifetime(s), _)) if s == "outer"));
        assert!(matches!(lexer.next_token(), Some((Token::Colon, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Loop, _))));
        assert!(matches!(lexer.next_token(), Some((Token::LBrace, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Break, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Lifetime(s), _)) if s == "outer"));
        assert!(matches!(lexer.next_token(), Some((Token::RBrace, _))));
    }

    // ==================== STRING LITERAL TESTS ====================

    // Escape sequences in ordinary string literals are decoded by the
    // lexer (via process_escape_sequences), so the token carries the
    // already-unescaped text.
    #[test]
    fn test_string_escape_sequences() {
        // Test basic escape sequences
        let mut lexer = Lexer::new(r#""hello\nworld""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "hello\nworld"),
            other => panic!("Expected StringLit, got {:?}", other),
        }

        // Test tab escape
        let mut lexer = Lexer::new(r#""hello\tworld""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "hello\tworld"),
            other => panic!("Expected StringLit, got {:?}", other),
        }

        // Test carriage return
        let mut lexer = Lexer::new(r#""hello\rworld""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "hello\rworld"),
            other => panic!("Expected StringLit, got {:?}", other),
        }

        // Test escaped backslash
        let mut lexer = Lexer::new(r#""hello\\world""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "hello\\world"),
            other => panic!("Expected StringLit, got {:?}", other),
        }

        // Test escaped quote
        let mut lexer = Lexer::new(r#""hello\"world""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "hello\"world"),
            other => panic!("Expected StringLit, got {:?}", other),
        }

        // Test null character
        let mut lexer = Lexer::new(r#""hello\0world""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "hello\0world"),
            other => panic!("Expected StringLit, got {:?}", other),
        }
    }

    #[test]
    fn test_string_hex_escape() {
        // Test \xNN hex escape (\x41 = 'A')
        let mut lexer = Lexer::new(r#""hello\x41world""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "helloAworld"),
            other => panic!("Expected StringLit, got {:?}", other),
        }
    }

    #[test]
    fn test_string_unicode_escape() {
        // Test \u{NNNN} Unicode escape (U+1F600 = 😀)
        let mut lexer = Lexer::new(r#""hello\u{1F600}world""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "hello😀world"),
            other => panic!("Expected StringLit, got {:?}", other),
        }

        // Test Greek letter (U+03C4 = τ)
        let mut lexer = Lexer::new(r#""\u{03C4}""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "τ"),
            other => panic!("Expected StringLit, got {:?}", other),
        }
    }

    // Character literals decode the same escape sequences as strings.
    #[test]
    fn test_char_escape_sequences() {
        let mut lexer = Lexer::new(r"'\n'");
        match lexer.next_token() {
            Some((Token::CharLit(c), _)) => assert_eq!(c, '\n'),
            other => panic!("Expected CharLit, got {:?}", other),
        }

        let mut lexer = Lexer::new(r"'\t'");
        match lexer.next_token() {
            Some((Token::CharLit(c), _)) => assert_eq!(c, '\t'),
            other => panic!("Expected CharLit, got {:?}", other),
        }

        let mut lexer = Lexer::new(r"'\\'");
        match lexer.next_token() {
            Some((Token::CharLit(c), _)) => assert_eq!(c, '\\'),
            other => panic!("Expected CharLit, got {:?}", other),
        }
    }

    #[test]
    fn test_raw_string() {
        // Raw strings should NOT process escapes
        let mut lexer = Lexer::new(r#"r"hello\nworld""#);
        match lexer.next_token() {
            Some((Token::RawStringLit(s), _)) => assert_eq!(s, r"hello\nworld"),
            other => panic!("Expected RawStringLit, got {:?}", other),
        }
    }

    #[test]
    fn test_raw_string_delimited() {
        // r#"..."# style: internal double quotes are preserved verbatim
        let mut lexer = Lexer::new(r##"r#"hello "world""#"##);
        match lexer.next_token() {
            Some((Token::RawStringDelimited(s), _)) => assert_eq!(s, r#"hello "world""#),
            other => panic!("Expected RawStringDelimited, got {:?}", other),
        }
    }

    // Byte strings produce raw bytes rather than a decoded String.
    #[test]
    fn test_byte_string() {
        let mut lexer = Lexer::new(r#"b"hello""#);
        match lexer.next_token() {
            Some((Token::ByteStringLit(bytes), _)) => {
                assert_eq!(bytes, vec![104, 101, 108, 108, 111]); // "hello" in ASCII
            }
            other => panic!("Expected ByteStringLit, got {:?}", other),
        }
    }

    // f"..." strings keep the {name} placeholders for later interpolation.
    #[test]
    fn test_interpolated_string() {
        let mut lexer = Lexer::new(r#"f"hello {name}""#);
        match lexer.next_token() {
            Some((Token::InterpolatedStringLit(s), _)) => assert_eq!(s, "hello {name}"),
            other => panic!("Expected InterpolatedStringLit, got {:?}", other),
        }
    }

    // σ"..." is a SQL sigil string; placeholders are preserved.
    #[test]
    fn test_sigil_string_sql() {
        let mut lexer = Lexer::new(r#"σ"SELECT * FROM {table}""#);
        match lexer.next_token() {
            Some((Token::SigilStringSql(s), _)) => assert_eq!(s, "SELECT * FROM {table}"),
            other => panic!("Expected SigilStringSql, got {:?}", other),
        }
    }

    // ρ"..." is a route sigil string; path parameters are preserved.
    #[test]
    fn test_sigil_string_route() {
        let mut lexer = Lexer::new(r#"ρ"/api/v1/{resource}/{id}""#);
        match lexer.next_token() {
            Some((Token::SigilStringRoute(s), _)) => assert_eq!(s, "/api/v1/{resource}/{id}"),
            other => panic!("Expected SigilStringRoute, got {:?}", other),
        }
    }

    #[test]
    fn test_unicode_in_strings() {
        // Test direct Unicode in strings (no escaping required)
        let mut lexer = Lexer::new(r#""τφσρ 你好 🦀""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "τφσρ 你好 🦀"),
            other => panic!("Expected StringLit, got {:?}", other),
        }
    }

    #[test]
    fn test_empty_string() {
        let mut lexer = Lexer::new(r#""""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, ""),
            other => panic!("Expected empty StringLit, got {:?}", other),
        }
    }

    #[test]
    fn test_escape_sequence_helper() {
        // Unit test the helper function directly
        assert_eq!(process_escape_sequences(r"hello\nworld"), "hello\nworld");
        assert_eq!(process_escape_sequences(r"hello\tworld"), "hello\tworld");
        assert_eq!(process_escape_sequences(r"hello\\world"), "hello\\world");
        assert_eq!(process_escape_sequences(r#"hello\"world"#), "hello\"world");
        assert_eq!(process_escape_sequences(r"hello\x41world"), "helloAworld");
        assert_eq!(
            process_escape_sequences(r"hello\u{1F600}world"),
            "hello😀world"
        );
    }
}