// sigil_parser/lexer.rs

1//! Lexer for the Sigil programming language.
2//!
3//! Handles polysynthetic morphemes, evidentiality markers, and multi-base numerals.
4
5use crate::span::Span;
6use logos::Logos;
7
/// Process escape sequences in a string literal.
///
/// Converts `\n`, `\t`, `\r`, `\\`, `\"`, `\'`, `\0`, `\xNN` and `\u{NNNN}`
/// to their actual characters. A backslash immediately followed by a line
/// break (LF or CRLF) is a line continuation: the break and any leading
/// spaces/tabs on the next line are stripped entirely. Unknown escapes are
/// kept verbatim (backslash included); malformed `\x`/`\u` payloads are
/// dropped silently so the lexer stays total.
fn process_escape_sequences(s: &str) -> String {
    let mut result = String::with_capacity(s.len());
    let mut chars = s.chars().peekable();

    while let Some(c) = chars.next() {
        if c != '\\' {
            result.push(c);
            continue;
        }
        match chars.next() {
            Some('\n') => {
                // Line continuation: skip newline and any leading whitespace
                while let Some(&c) = chars.peek() {
                    if c == ' ' || c == '\t' {
                        chars.next();
                    } else {
                        break;
                    }
                }
            }
            // CRLF line continuation: `\` at the end of a \r\n-terminated
            // line previously fell through to the unknown-escape arm and
            // leaked a literal `\` + CR into the string.
            Some('\r') if chars.peek() == Some(&'\n') => {
                chars.next(); // consume the '\n'
                while let Some(&c) = chars.peek() {
                    if c == ' ' || c == '\t' {
                        chars.next();
                    } else {
                        break;
                    }
                }
            }
            Some('n') => result.push('\n'),
            Some('t') => result.push('\t'),
            Some('r') => result.push('\r'),
            Some('\\') => result.push('\\'),
            Some('"') => result.push('"'),
            Some('\'') => result.push('\''),
            Some('0') => result.push('\0'),
            Some('x') => {
                // \xNN - up to two hex digits; stop at the first non-hex char
                // (the old loop just spun without consuming on non-hex input).
                let mut hex = String::new();
                for _ in 0..2 {
                    match chars.peek() {
                        Some(&c) if c.is_ascii_hexdigit() => {
                            hex.push(c);
                            chars.next();
                        }
                        _ => break,
                    }
                }
                if let Ok(val) = u8::from_str_radix(&hex, 16) {
                    // u8 -> char maps the byte to U+0000..=U+00FF.
                    result.push(val as char);
                }
            }
            Some('u') => {
                // \u{NNNN} - Unicode code point in braces
                if chars.peek() == Some(&'{') {
                    chars.next(); // consume '{'
                    let mut hex = String::new();
                    while let Some(&c) = chars.peek() {
                        if c == '}' {
                            chars.next();
                            break;
                        }
                        if c.is_ascii_hexdigit() {
                            hex.push(c);
                            chars.next();
                        } else {
                            break;
                        }
                    }
                    // Invalid code points (surrogates, > U+10FFFF) are dropped.
                    if let Ok(val) = u32::from_str_radix(&hex, 16) {
                        if let Some(ch) = char::from_u32(val) {
                            result.push(ch);
                        }
                    }
                }
            }
            Some(other) => {
                // Unknown escape, keep as-is
                result.push('\\');
                result.push(other);
            }
            // Trailing lone backslash is preserved.
            None => result.push('\\'),
        }
    }
    result
}
85
/// Process escape sequences in byte string literals, returning raw bytes.
///
/// Mirrors `process_escape_sequences` but emits bytes: `\n`, `\t`, `\r`,
/// `\\`, `\"`, `\'`, `\0` and `\xNN` are decoded; a backslash followed by a
/// line break (LF or CRLF) is a line continuation whose break and leading
/// spaces/tabs are stripped. Non-ASCII characters are skipped (byte strings
/// cannot contain them), and malformed `\x` payloads are dropped silently.
fn process_byte_escape_sequences(s: &str) -> Vec<u8> {
    let mut result = Vec::with_capacity(s.len());
    let mut chars = s.chars().peekable();

    while let Some(c) = chars.next() {
        if c != '\\' {
            // Non-ASCII in byte strings is ignored (Rust doesn't allow it)
            if c.is_ascii() {
                result.push(c as u8);
            }
            continue;
        }
        match chars.next() {
            Some('\n') => {
                // Line continuation: skip newline and any leading whitespace
                while let Some(&c) = chars.peek() {
                    if c == ' ' || c == '\t' {
                        chars.next();
                    } else {
                        break;
                    }
                }
            }
            // CRLF line continuation: `\` + \r\n previously leaked a literal
            // `\` + CR into the output via the unknown-escape arm.
            Some('\r') if chars.peek() == Some(&'\n') => {
                chars.next(); // consume the '\n'
                while let Some(&c) = chars.peek() {
                    if c == ' ' || c == '\t' {
                        chars.next();
                    } else {
                        break;
                    }
                }
            }
            Some('n') => result.push(b'\n'),
            Some('t') => result.push(b'\t'),
            Some('r') => result.push(b'\r'),
            Some('\\') => result.push(b'\\'),
            Some('"') => result.push(b'"'),
            Some('\'') => result.push(b'\''),
            Some('0') => result.push(0),
            Some('x') => {
                // \xNN - up to two hex digits; stop at the first non-hex char
                // (the old loop just spun without consuming on non-hex input).
                let mut hex = String::new();
                for _ in 0..2 {
                    match chars.peek() {
                        Some(&c) if c.is_ascii_hexdigit() => {
                            hex.push(c);
                            chars.next();
                        }
                        _ => break,
                    }
                }
                if let Ok(val) = u8::from_str_radix(&hex, 16) {
                    result.push(val);
                }
            }
            Some(other) => {
                // Unknown escape, keep as-is (ASCII payload only)
                result.push(b'\\');
                if other.is_ascii() {
                    result.push(other as u8);
                }
            }
            // Trailing lone backslash is preserved.
            None => result.push(b'\\'),
        }
    }
    result
}
141
142/// Callback for delimited raw strings (r#"..."#).
143/// Reads until the closing "# is found.
144/// Callback for block comments: /* ... */
145/// Consumes characters until */ is found
146fn block_comment_callback(lex: &mut logos::Lexer<'_, Token>) -> Option<String> {
147    let remainder = lex.remainder();
148
149    // Find the closing */
150    if let Some(end_pos) = remainder.find("*/") {
151        let content = &remainder[..end_pos];
152        // Bump past content and closing */ (2 chars)
153        lex.bump(end_pos + 2);
154        Some(content.to_string())
155    } else {
156        // No closing */ found - consume rest as comment
157        let len = remainder.len();
158        lex.bump(len);
159        Some(remainder.to_string())
160    }
161}
162
163fn raw_string_delimited_callback(lex: &mut logos::Lexer<'_, Token>) -> Option<String> {
164    let remainder = lex.remainder();
165
166    // Find the closing "#
167    if let Some(end_pos) = remainder.find("\"#") {
168        let content = &remainder[..end_pos];
169        // Bump past content and closing "# (2 chars)
170        lex.bump(end_pos + 2);
171        Some(content.to_string())
172    } else {
173        None
174    }
175}
176
177/// Callback for multi-line string literals.
178/// Reads from """ until the next """ is found.
179fn multiline_string_callback(lex: &mut logos::Lexer<'_, Token>) -> Option<String> {
180    let remainder = lex.remainder();
181
182    // Find the closing """
183    if let Some(end_pos) = remainder.find("\"\"\"") {
184        let content = &remainder[..end_pos];
185        // Bump the lexer past the content and closing quotes
186        lex.bump(end_pos + 3);
187        Some(process_escape_sequences(content))
188    } else {
189        // No closing """ found - skip to end and return what we have
190        None
191    }
192}
193
/// Process escape sequences in a character literal.
///
/// The input is the text between the single quotes. An unknown escape
/// decays to the escaped character itself; malformed `\x`/`\u` payloads
/// and empty input yield `'?'`.
fn process_char_escape(s: &str) -> char {
    let mut it = s.chars();
    let head = match it.next() {
        Some(c) => c,
        None => return '?',
    };
    // A plain (unescaped) character is returned as-is.
    if head != '\\' {
        return head;
    }
    match it.next() {
        Some('n') => '\n',
        Some('t') => '\t',
        Some('r') => '\r',
        Some('\\') => '\\',
        Some('"') => '"',
        Some('\'') => '\'',
        Some('0') => '\0',
        Some('x') => {
            // \xNN - exactly the next two characters are taken as hex.
            let hex: String = it.take(2).collect();
            match u8::from_str_radix(&hex, 16) {
                Ok(v) => v as char,
                Err(_) => '?',
            }
        }
        Some('u') => match it.next() {
            // \u{NNNN} - hex digits up to the closing brace.
            Some('{') => {
                let hex: String = it.take_while(|&c| c != '}').collect();
                u32::from_str_radix(&hex, 16)
                    .ok()
                    .and_then(char::from_u32)
                    .unwrap_or('?')
            }
            _ => '?',
        },
        Some(c) => c,
        None => '?',
    }
}
230
/// Process escape sequences in a byte character literal (`b'x'`).
///
/// The input is the text between the quotes, after the `b` prefix. An
/// unknown escape decays to the escaped character's low byte; malformed
/// `\x` payloads and empty input yield `b'?'`.
fn process_byte_char_escape(s: &str) -> u8 {
    let mut it = s.chars();
    let head = match it.next() {
        Some(c) => c,
        None => return b'?',
    };
    // A plain (unescaped) character is truncated to its low byte.
    if head != '\\' {
        return head as u8;
    }
    match it.next() {
        Some('n') => b'\n',
        Some('t') => b'\t',
        Some('r') => b'\r',
        Some('\\') => b'\\',
        Some('"') => b'"',
        Some('\'') => b'\'',
        Some('0') => b'\0',
        Some('x') => {
            // \xNN - exactly the next two characters are taken as hex.
            let hex: String = it.take(2).collect();
            u8::from_str_radix(&hex, 16).unwrap_or(b'?')
        }
        Some(c) => c as u8,
        None => b'?',
    }
}
254
/// Token types for Sigil.
///
/// Derived with `logos`; inter-token whitespace is skipped by the
/// `#[logos(skip ...)]` attribute below. Many symbols are deliberately
/// shared between roles — the "parser is context-aware" notes mark where
/// disambiguation happens downstream rather than in the lexer.
#[derive(Logos, Debug, Clone, PartialEq)]
#[logos(skip r"[ \t\r\n\f]+")]
pub enum Token {
    // === Comments ===
    // NOTE(review): `//!...` and `//@ rune:...` also match this pattern;
    // presumably logos' regex-priority rules pick the more specific
    // DocComment/RuneAnnotation variants — confirm with a lexer test.
    #[regex(r"//[^\n]*", |lex| lex.slice().to_string())]
    LineComment(String),

    #[regex(r"//![^\n]*", |lex| lex.slice().to_string())]
    DocComment(String),

    // Tilde comment style: ~~ ... ~~
    #[regex(r"~~[^\n]*", |lex| lex.slice().to_string())]
    TildeComment(String),

    // Block comment: /* ... */ (non-nested)
    #[token("/*", block_comment_callback)]
    BlockComment(String),

    // === Keywords (Sigil-native only - Rust purged) ===
    // Note: λ/Λ handled by Token::Lambda - parser is context-aware
    #[token("rite")] // rite (ritual/spell) for function
    Fn,
    #[token("async")]
    Async,
    #[token("≔")] // definition operator
    Let,
    // Note: ∆ handled by Token::Delta - parser is context-aware
    #[token("vary")] // vary for mutable
    Mut,
    #[token("◆")] // diamond for const
    Const,
    #[token("linear")]
    Linear,
    #[token("type")]
    Type,
    // Note: Σ handled by Token::Sigma - parser is context-aware
    #[token("sigil")] // sigil for struct
    Struct,
    #[token("ᛈ")] // perthro rune for enum
    Enum,
    // Note: Θ handled by Token::Theta - parser is context-aware
    #[token("aspect")] // aspect for trait
    Trait,
    #[token("⊢")] // turnstile for impl
    Impl,
    #[token("scroll")] // scroll for module
    Mod,
    #[token("invoke")] // invoke for use/import
    Use,
    #[token("☉")] // sun for public
    Pub,
    #[token("actor")]
    Actor,
    #[token("saga")]
    Saga,
    #[token("scope")]
    Scope,
    #[token("rune")]
    Rune,
    #[token("macro")]
    Macro,
    #[token("macro_rules")]
    MacroRules,

    // Control flow (Sigil-native only)
    #[token("⎇")] // ISO branch symbol for if
    If,
    #[token("⎉")] // ISO alternative symbol for else
    Else,
    #[token("⌥")] // option key symbol for match
    Match,
    // Note: ∞ handled by Token::Infinity - parser is context-aware
    #[token("forever")] // forever for infinite loop
    Loop,
    #[token("⟳")] // cycle arrow for while
    While,
    // Note: ∀ handled by Token::ForAll - parser is context-aware
    #[token("each")] // each for iteration
    For,
    // Note: ∈ handled by Token::ElementOf - parser is context-aware
    #[token("of")] // of for membership
    In,
    #[token("⊲")] // left triangle for break
    Break,
    #[token("⊳")] // right triangle for continue
    Continue,
    #[token("⤺")] // return arrow
    Return,
    #[token("yield")]
    Yield,
    #[token("await")]
    Await,

    // Other keywords (Sigil-native only)
    // Note: ξ/Ξ handled by Token::Xi - parser is context-aware
    #[token("this")] // this for self reference
    SelfLower,
    #[token("This")] // This for Self type
    SelfUpper,
    // Note: ↑ handled by Token::IntensityUp - parser is context-aware
    #[token("above")] // above for super/parent
    Super,
    #[token("tome")] // tome for crate
    Crate,
    #[token("∋")] // such that for where clauses
    Where,
    #[token("as")] // type casting - no better symbolic alternative
    As,
    #[token("dyn")]
    Dyn,
    #[token("move")]
    Move,
    #[token("ref")]
    Ref,
    #[token("static")]
    Static,
    #[token("unsafe")]
    Unsafe,
    #[token("extern")]
    Extern,
    #[token("asm")]
    Asm,
    #[token("volatile")]
    Volatile,
    #[token("naked")]
    Naked,
    #[token("packed")]
    Packed,
    #[token("simd")]
    Simd,
    #[token("atomic")]
    Atomic,
    #[token("derive")]
    Derive,
    #[token("on")]
    On,

    // Plurality keywords (DAEMONIORUM extensions)
    #[token("alter")]
    Alter,
    #[token("switch")]
    Switch,
    #[token("headspace")]
    Headspace,
    #[token("cocon")]
    CoCon,
    #[token("reality")]
    Reality,
    #[token("split")]
    Split,
    #[token("trigger")]
    Trigger,
    #[token("layer")]
    Layer,
    #[token("location")]
    Location,
    #[token("states")]
    States,
    #[token("anima")]
    Anima,
    #[token("to")]
    To,
    #[token("from")]
    From,

    // Alter-source markers (compound tokens)
    #[token("@!")]
    AlterSourceFronting,
    #[token("@~")]
    AlterSourceCoCon,
    #[token("@?")]
    AlterSourceDormant,
    #[token("@‽")]
    AlterSourceBlended,

    // Boolean literals (Sigil-native only)
    // Note: ⊤/⊥ handled by Token::Top/Bottom - parser is context-aware
    #[token("yea")] // yea for true
    True,
    #[token("nay")] // nay for false
    False,

    // Null literal
    #[token("null")]
    Null,

    // === Morphemes (Greek letters) ===
    // Single Greek letters lex as morpheme tokens; Greek letters embedded
    // in longer identifiers are matched by the Ident regex below instead.
    #[token("τ")]
    #[token("Τ")]
    Tau, // Transform/map

    #[token("φ")]
    #[token("Φ")]
    Phi, // Filter

    #[token("σ")]
    #[token("Σ")]
    Sigma, // Sort/Sum - also struct in declaration context

    #[token("ρ")]
    #[token("Ρ")]
    Rho, // Reduce

    #[token("λ")]
    #[token("Λ")]
    Lambda, // Lambda - also fn in declaration context

    #[token("Π")]
    Pi, // Product

    #[token("⌛")]
    Hourglass, // Await symbol

    // Additional morphemes
    #[token("δ")]
    #[token("Δ")]
    Delta, // Difference/change

    #[token("ε")]
    Epsilon, // Empty/null

    #[token("ω")]
    #[token("Ω")]
    Omega, // End/terminal

    #[token("α")]
    Alpha, // First element

    #[token("ζ")]
    Zeta, // Zip/combine

    // === Additional Access Morphemes ===
    #[token("μ")]
    #[token("Μ")]
    Mu, // Middle/median element

    #[token("χ")]
    #[token("Χ")]
    Chi, // Random/choice (from chaos)

    #[token("ν")]
    #[token("Ν")]
    Nu, // Nth element (ordinal)

    #[token("ξ")]
    #[token("Ξ")]
    Xi, // Next in sequence

    #[token("ψ")]
    #[token("Ψ")]
    Psi, // Psychological/mental state

    #[token("θ")]
    #[token("Θ")]
    Theta, // Threshold/angle

    #[token("κ")]
    #[token("Κ")]
    Kappa, // Callback/continuation

    // === Parallel/Concurrency Morphemes ===
    #[token("∥")]
    #[token("parallel")]
    Parallel, // Parallel execution (U+2225)

    #[token("⊛")]
    #[token("gpu")]
    Gpu, // GPU compute shader (U+229B - circled asterisk)

    // === Quantifiers (for AI-native set operations) ===
    #[token("∀")]
    ForAll, // Universal quantification

    #[token("∃")]
    Exists, // Existential quantification

    #[token("∈")]
    ElementOf, // Membership test

    #[token("∉")]
    NotElementOf, // Non-membership

    // === Set Operations ===
    #[token("∪")]
    Union, // Set union

    #[token("∩")]
    Intersection, // Set intersection

    #[token("∖")]
    SetMinus, // Set difference

    #[token("⊂")]
    Subset, // Proper subset

    #[token("⊆")]
    SubsetEq, // Subset or equal

    #[token("⊃")]
    Superset, // Proper superset

    #[token("⊇")]
    SupersetEq, // Superset or equal

    // === Logic Operators ===
    #[token("∧")]
    LogicAnd, // Logical conjunction

    #[token("∨")]
    LogicOr, // Logical disjunction

    #[token("¬")]
    LogicNot, // Logical negation

    #[token("⊻")]
    LogicXor, // Exclusive or

    #[token("⊤")]
    Top, // True/any type

    #[token("⊥")]
    Bottom, // False/never type

    // === Bitwise Operators (Unicode) ===
    #[token("⋏")]
    BitwiseAndSymbol, // Bitwise AND (U+22CF)

    #[token("⋎")]
    BitwiseOrSymbol, // Bitwise OR (U+22CE)

    #[token("⊙")]
    CircledDot, // Hadamard product / element-wise multiply (U+2299)

    // Note: ⊗ (tensor product) is already defined as Token::Tensor below

    // === Type Theory ===
    #[token("∷")]
    TypeAnnotation, // Type annotation (alternative to :)

    // === Analysis/Calculus ===
    #[token("∫")]
    Integral, // Cumulative sum

    #[token("∂")]
    Partial, // Discrete derivative

    #[token("√")]
    Sqrt, // Square root

    #[token("∛")]
    Cbrt, // Cube root

    #[token("∇")]
    Nabla, // Gradient (U+2207)

    // === APL-Inspired Symbols ===
    #[token("⍋")]
    GradeUp, // Sort ascending (U+234B)

    #[token("⍒")]
    GradeDown, // Sort descending (U+2352)

    #[token("⌽")]
    Rotate, // Reverse/rotate (U+233D)

    #[token("↻")]
    CycleArrow, // Cycle/repeat (U+21BB)

    #[token("⌺")]
    QuadDiamond, // Windows/stencil (U+233A)

    #[token("⊞")]
    SquaredPlus, // Chunks (U+229E)

    #[token("⍳")]
    Iota, // Enumerate/index (U+2373)

    // === Category Theory ===
    #[token("∘")]
    Compose, // Function composition

    #[token("⊗")]
    Tensor, // Tensor product

    #[token("⊕")]
    DirectSum, // Direct sum / XOR

    // === Data Operations ===
    #[token("⋈")]
    Bowtie, // Join/zip combining (U+22C8)

    #[token("⋳")]
    ElementSmallVerticalBar, // Flatten (U+22F3)

    #[token("⊔")]
    SquareCup, // Lattice join / supremum (U+2294)

    #[token("⊓")]
    SquareCap, // Lattice meet / infimum (U+2293)

    // === Evidentiality Markers ===
    // Note: These are handled contextually since ! and ? have other uses
    #[token("‽")]
    Interrobang, // Paradox/trust boundary (U+203D)

    #[token("◊")]
    Lozenge, // Predicted/speculative (U+25CA) - Token◊

    #[token("□")]
    BoxSymbol, // Necessity/verification (U+25A1) - holographic necessity operator

    // === Legion Morphemes (Holographic Agent Collective) ===
    // From Infernum 2.0 - distributed memory and multi-agent coordination
    #[token("∿")]
    #[token("legion_field")]
    LegionField, // Collective memory substrate (U+223F sine wave) - memory∿

    #[token("⫰")]
    #[token("interfere")]
    Interfere, // Interference query (U+2AF0) - query ⫰ field∿

    #[token("⟁")]
    #[token("distribute")]
    Distribute, // Holographic distribution (U+27C1) - task ⟁ 8

    #[token("⟀")]
    #[token("gather")]
    Gather, // Interference gathering (U+27C0) - fragments ⟀

    #[token("↠")]
    #[token("broadcast")]
    Broadcast, // One-to-many broadcast (U+21A0) - signal ↠ legion

    #[token("⇢")]
    #[token("consensus")]
    Consensus, // Many-to-one consensus (U+21E2) - contributions ⇢

    // Compound Legion operators
    #[token("⊕=")]
    DirectSumEq, // Superposition assign - field∿ ⊕= pattern

    #[token("∂=")]
    PartialEq_, // Decay assign - field∿ ∂= 0.95 (renamed to avoid std conflict)

    #[token("⫰=")]
    InterfereEq, // Interference assign

    // === Affective Markers (Sentiment & Emotion) ===
    // Sentiment polarity
    #[token("⊖")]
    AffectNegative, // Negative sentiment (U+2296 Circled Minus)

    #[token("⊜")]
    AffectNeutral, // Neutral sentiment (U+229C Circled Equals)

    // Note: ⊕ (U+2295) is already DirectSum - we'll use it dual-purpose for positive sentiment

    // Sarcasm/Irony
    #[token("⸮")]
    IronyMark, // Irony/sarcasm marker (U+2E2E - historical percontation point!)

    // Intensity modifiers
    #[token("↑")]
    IntensityUp, // Intensifier (U+2191)

    #[token("↓")]
    IntensityDown, // Dampener (U+2193)

    #[token("⇈")]
    IntensityMax, // Maximum intensity (U+21C8)

    // Formality register
    #[token("♔")]
    FormalRegister, // Formal (U+2654 White King)

    #[token("♟")]
    InformalRegister, // Informal (U+265F Black Pawn)

    // Emotion markers (Plutchik's wheel)
    #[token("☺")]
    EmotionJoy, // Joy (U+263A)

    #[token("☹")]
    EmotionSadness, // Sadness (U+2639)

    #[token("⚡")]
    EmotionAnger, // Anger (U+26A1)

    #[token("❄")]
    EmotionFear, // Fear (U+2744)

    #[token("✦")]
    EmotionSurprise, // Surprise (U+2726)

    #[token("♡")]
    EmotionLove, // Love/Trust (U+2661)

    // Confidence markers
    #[token("◉")]
    ConfidenceHigh, // High confidence (U+25C9)

    #[token("◎")]
    ConfidenceMedium, // Medium confidence (U+25CE)

    #[token("○")]
    ConfidenceLow, // Low confidence (U+25CB)

    // === Aspect Morphemes (verb aspects) ===
    #[token("·ing")]
    AspectProgressive, // Ongoing/streaming aspect

    #[token("·ed")]
    AspectPerfective, // Completed aspect

    #[token("·able")]
    AspectPotential, // Capability aspect

    #[token("·ive")]
    AspectResultative, // Result-producing aspect

    // === Operators ===
    #[token("|")]
    Pipe,
    #[token("·")] // middle dot - Sigil path separator (Rust :: purged)
    MiddleDot,
    #[token("→")] // rightwards arrow (Rust -> purged)
    Arrow,
    #[token("=>")]
    FatArrow,
    #[token("<-")]
    LeftArrow,
    #[token("==")]
    EqEq,
    #[token("!=")]
    NotEq,
    #[token("<=")]
    LtEq,
    #[token(">=")]
    GtEq,
    #[token("<")]
    Lt,
    #[token(">")]
    Gt,
    #[token("+")]
    Plus,
    #[token("-")]
    Minus,
    #[token("*")]
    Star,
    #[token("/")]
    Slash,
    #[token("%")]
    Percent,
    #[token("**")]
    StarStar, // Exponentiation
    // Note: ∧/∨ handled by Token::LogicAnd/LogicOr - parser is context-aware
    // AndAnd and OrOr tokens kept for compatibility but won't lex anything
    // (no #[token] attribute: logos never produces these variants).
    AndAnd,
    OrOr,
    #[token("!")]
    Bang, // Evidentiality: known / logical not
    #[token("?")]
    Question, // Evidentiality: uncertain / try
    #[token("~")]
    Tilde, // Evidentiality: reported
    #[token("&")]
    Amp,
    #[token("^")]
    Caret,
    #[token("<<=")]
    ShlEq,
    #[token(">>=")]
    ShrEq,
    #[token("<<")]
    Shl,
    #[token(">>")]
    Shr,
    #[token("=")]
    Eq,
    #[token("+=")]
    PlusEq,
    #[token("-=")]
    MinusEq,
    #[token("*=")]
    StarEq,
    #[token("/=")]
    SlashEq,
    #[token("%=")]
    PercentEq,
    #[token("|=")]
    PipeEq,
    #[token("&=")]
    AmpEq,
    #[token("^=")]
    CaretEq,
    #[token("..")]
    DotDot,
    #[token("..=")]
    DotDotEq,
    #[token("++")]
    PlusPlus, // Concatenation
    // ColonColon now uses · (MiddleDot handles this - Rust :: purged)
    // (no #[token] attribute: logos never produces this variant).
    ColonColon,
    #[token(":")]
    Colon,
    #[token(";")]
    Semi,
    #[token(",")]
    Comma,
    #[token(".")]
    Dot,
    #[token("@")]
    At,
    #[token("#!")]
    HashBang, // Inner attribute prefix #![...]
    #[token("#")]
    Hash,
    #[token("_", priority = 3)]
    Underscore,

    // === Delimiters ===
    #[token("(")]
    LParen,
    #[token(")")]
    RParen,
    #[token("{")]
    LBrace,
    #[token("}")]
    RBrace,
    #[token("[")]
    LBracket,
    #[token("]")]
    RBracket,

    // === Special symbols ===
    #[token("∅")]
    Empty, // Void/emptiness (śūnya)
    #[token("◯")]
    Circle, // Geometric zero
    #[token("∞")]
    Infinity, // Ananta

    // === Protocol Operations (Sigil-native networking) ===
    #[token("⇒")]
    ProtoSend, // Send data (U+21D2 - rightwards double arrow)

    #[token("⇐")]
    ProtoRecv, // Receive data (U+21D0 - leftwards double arrow)

    #[token("≋")]
    ProtoStream, // Stream data (U+224B - triple tilde)

    #[token("⊸")]
    ProtoConnect, // Connect/lollipop (U+22B8 - multimap)

    #[token("⏱")]
    ProtoTimeout, // Timeout (U+23F1 - stopwatch)

    // Note: ⊗ (Tensor) is used for close in protocol contexts

    // Protocol keywords for ASCII fallback
    #[token("send")]
    Send,
    #[token("recv")]
    Recv,
    #[token("stream")]
    Stream,
    #[token("connect")]
    Connect,
    #[token("close")]
    Close,
    #[token("timeout")]
    Timeout,
    #[token("retry")]
    Retry,
    #[token("header")]
    Header,
    #[token("body")]
    Body,

    // Protocol type identifiers (for incorporation: http·, ws·, grpc·, kafka·)
    #[token("http")]
    Http,
    #[token("https")]
    Https,
    #[token("ws")]
    Ws,
    #[token("wss")]
    Wss,
    #[token("grpc")]
    Grpc,
    #[token("kafka")]
    Kafka,
    #[token("amqp")]
    Amqp,
    #[token("graphql")]
    GraphQL,

    // === Numbers ===
    // All numeric literals keep the raw slice (suffix and `_` separators
    // included); parsing to a value happens downstream.
    // Binary: 0b... with optional type suffix
    #[regex(r"0b[01_]+(i8|i16|i32|i64|i128|isize|u8|u16|u32|u64|u128|usize)?", |lex| lex.slice().to_string())]
    BinaryLit(String),

    // Octal: 0o... with optional type suffix
    #[regex(r"0o[0-7_]+(i8|i16|i32|i64|i128|isize|u8|u16|u32|u64|u128|usize)?", |lex| lex.slice().to_string())]
    OctalLit(String),

    // Hex: 0x... with optional type suffix
    #[regex(r"0x[0-9a-fA-F_]+(i8|i16|i32|i64|i128|isize|u8|u16|u32|u64|u128|usize)?", |lex| lex.slice().to_string())]
    HexLit(String),

    // Vigesimal: 0v... (base 20)
    #[regex(r"0v[0-9a-jA-J_]+", |lex| lex.slice().to_string())]
    VigesimalLit(String),

    // Sexagesimal: 0s... (base 60)
    #[regex(r"0s[0-9a-zA-Z_]+", |lex| lex.slice().to_string())]
    SexagesimalLit(String),

    // Duodecimal: 0z... (base 12)
    #[regex(r"0z[0-9a-bA-B_]+", |lex| lex.slice().to_string())]
    DuodecimalLit(String),

    // Float: 123.456 or 1.23e10 or 1e-15 (with or without decimal point if exponent present)
    // Optional type suffix: f16, f32, f64, f128
    #[regex(r"([0-9][0-9_]*\.[0-9][0-9_]*([eE][+-]?[0-9_]+)?|[0-9][0-9_]*[eE][+-]?[0-9_]+)(f16|f32|f64|f128)?", |lex| lex.slice().to_string())]
    FloatLit(String),

    // Integer: 123 with optional type suffix (i8, i16, i32, i64, i128, isize, u8, u16, u32, u64, u128, usize)
    #[regex(r"[0-9][0-9_]*(i8|i16|i32|i64|i128|isize|u8|u16|u32|u64|u128|usize)?", |lex| lex.slice().to_string())]
    IntLit(String),

    // === Strings ===
    // Regular string with escape sequence processing
    // Note: \\(.|\n) handles both regular escapes and line continuation (\ at end of line)
    #[regex(r#""([^"\\]|\\(.|\n))*""#, |lex| {
        let s = lex.slice();
        let inner = &s[1..s.len()-1];
        process_escape_sequences(inner)
    })]
    StringLit(String),

    // Multi-line string (triple-quoted) - handled via callback
    #[token(r#"""""#, multiline_string_callback)]
    MultiLineStringLit(String),

    // Byte string literal
    #[regex(r#"b"([^"\\]|\\.)*""#, |lex| {
        let s = lex.slice();
        let inner = &s[2..s.len()-1];
        process_byte_escape_sequences(inner)
    })]
    ByteStringLit(Vec<u8>),

    // Interpolated string (will be parsed further for expressions)
    #[regex(r#"f"([^"\\]|\\.)*""#, |lex| {
        let s = lex.slice();
        let inner = &s[2..s.len()-1];
        process_escape_sequences(inner)
    })]
    InterpolatedStringLit(String),

    // Sigil string - SQL template (σ prefix)
    #[regex(r#"σ"([^"\\]|\\.)*""#, |lex| {
        let s = lex.slice();
        // Get byte index after the σ character (which is 2 bytes in UTF-8)
        let start = "σ".len() + 1; // σ + opening quote
        let inner = &s[start..s.len()-1];
        process_escape_sequences(inner)
    })]
    SigilStringSql(String),

    // Sigil string - Route template (ρ prefix)
    #[regex(r#"ρ"([^"\\]|\\.)*""#, |lex| {
        let s = lex.slice();
        // Get byte index after the ρ character (which is 2 bytes in UTF-8)
        let start = "ρ".len() + 1; // ρ + opening quote
        let inner = &s[start..s.len()-1];
        process_escape_sequences(inner)
    })]
    SigilStringRoute(String),

    // Char literal with escape sequence processing
    // Matches: single char, hex escape \xNN, unicode escape \u{N...}, or simple escape \c
    #[regex(r"'([^'\\]|\\x[0-9a-fA-F]{2}|\\u\{[0-9a-fA-F]{1,6}\}|\\.)'", |lex| {
        let s = lex.slice();
        let inner = &s[1..s.len()-1];
        process_char_escape(inner)
    })]
    CharLit(char),

    // Byte char literal (b'x' or b'\n')
    #[regex(r"b'([^'\\]|\\x[0-9a-fA-F]{2}|\\.)'", |lex| {
        let s = lex.slice();
        // Extract the character between b' and '
        let inner = &s[2..s.len()-1];
        process_byte_char_escape(inner)
    })]
    ByteCharLit(u8),

    // Raw string (no escape processing, but allows \" for literal quotes in patterns)
    #[regex(r#"r"([^"\\]|\\.)*""#, |lex| {
        let s = lex.slice();
        s[2..s.len()-1].to_string()
    })]
    RawStringLit(String),

    // Raw string with delimiter (r#"..."# style) - handles internal quotes
    #[token(r##"r#""##, raw_string_delimited_callback)]
    RawStringDelimited(String),

    // === Lifetime/Label (for loop labels like 'outer: loop { break 'outer }) ===
    // The payload strips the leading apostrophe.
    #[regex(r"'[a-zA-Z_][a-zA-Z0-9_]*", |lex| lex.slice()[1..].to_string())]
    Lifetime(String),

    // === Identifiers ===
    // Includes Greek letters for polysynthetic identifiers like compute_ψ_state
    // Greek letters (both cases): αΑ, βΒ, γΓ, δΔ, εΕ, ζΖ, ηΗ, θΘ, ιΙ, κΚ, λΛ, μΜ, νΝ, ξΞ, οΟ, πΠ, ρΡ, σΣ, τΤ, υΥ, φΦ, χΧ, ψΨ, ωΩ
    #[regex(r"[a-zA-Z_αΑβΒγΓδΔεΕζΖηΗθΘιΙκΚλΛμΜνΝξΞοΟπΠρΡσΣτΤυΥφΦχΧψΨωΩ][a-zA-Z0-9_αΑβΒγΓδΔεΕζΖηΗθΘιΙκΚλΛμΜνΝξΞοΟπΠρΡσΣτΤυΥφΦχΧψΨωΩ]*", |lex| lex.slice().to_string())]
    Ident(String),

    // === Rune annotation ===
    #[regex(r"//@\s*rune:\s*[a-zA-Z_][a-zA-Z0-9_]*", |lex| lex.slice().to_string())]
    RuneAnnotation(String),
}
1081
1082impl Token {
1083    pub fn is_keyword(&self) -> bool {
1084        matches!(
1085            self,
1086            Token::Fn
1087                | Token::Async
1088                | Token::Let
1089                | Token::Mut
1090                | Token::Const
1091                | Token::Type
1092                | Token::Struct
1093                | Token::Enum
1094                | Token::Trait
1095                | Token::Impl
1096                | Token::Mod
1097                | Token::Use
1098                | Token::Pub
1099                | Token::Actor
1100                | Token::Saga
1101                | Token::Scope
1102                | Token::Rune
1103                | Token::If
1104                | Token::Else
1105                | Token::Match
1106                | Token::Loop
1107                | Token::While
1108                | Token::For
1109                | Token::In
1110                | Token::Break
1111                | Token::Continue
1112                | Token::Return
1113                | Token::Yield
1114                | Token::Await
1115        ) || self.is_plurality_keyword()
1116    }
1117
1118    pub fn is_plurality_keyword(&self) -> bool {
1119        matches!(
1120            self,
1121            Token::Alter
1122                | Token::Switch
1123                | Token::Headspace
1124                | Token::CoCon
1125                | Token::Reality
1126                | Token::Split
1127                | Token::Trigger
1128                | Token::Layer
1129                | Token::Location
1130                | Token::States
1131                | Token::Anima
1132                | Token::To
1133                | Token::From
1134        )
1135    }
1136
1137    pub fn is_alter_source(&self) -> bool {
1138        matches!(
1139            self,
1140            Token::AlterSourceFronting
1141                | Token::AlterSourceCoCon
1142                | Token::AlterSourceDormant
1143                | Token::AlterSourceBlended
1144        )
1145    }
1146
1147    pub fn is_morpheme(&self) -> bool {
1148        matches!(
1149            self,
1150            Token::Tau | Token::Phi | Token::Sigma | Token::Rho |
1151            Token::Lambda | Token::Pi | Token::Hourglass |
1152            Token::Delta | Token::Epsilon | Token::Omega | Token::Alpha | Token::Zeta |
1153            Token::Mu | Token::Chi | Token::Nu | Token::Xi |  // Access morphemes
1154            Token::Parallel | Token::Gpu |  // Concurrency morphemes
1155            Token::Integral | Token::Partial | Token::Sqrt | Token::Cbrt |
1156            Token::Compose
1157        )
1158    }
1159
1160    pub fn is_aspect(&self) -> bool {
1161        matches!(
1162            self,
1163            Token::AspectProgressive
1164                | Token::AspectPerfective
1165                | Token::AspectPotential
1166                | Token::AspectResultative
1167        )
1168    }
1169
1170    pub fn is_data_op(&self) -> bool {
1171        matches!(
1172            self,
1173            Token::Bowtie | Token::ElementSmallVerticalBar | Token::SquareCup | Token::SquareCap
1174        )
1175    }
1176
1177    pub fn is_bitwise_symbol(&self) -> bool {
1178        matches!(self, Token::BitwiseAndSymbol | Token::BitwiseOrSymbol)
1179    }
1180
1181    pub fn is_quantifier(&self) -> bool {
1182        matches!(
1183            self,
1184            Token::ForAll | Token::Exists | Token::ElementOf | Token::NotElementOf
1185        )
1186    }
1187
1188    pub fn is_set_op(&self) -> bool {
1189        matches!(
1190            self,
1191            Token::Union
1192                | Token::Intersection
1193                | Token::SetMinus
1194                | Token::Subset
1195                | Token::SubsetEq
1196                | Token::Superset
1197                | Token::SupersetEq
1198        )
1199    }
1200
1201    pub fn is_logic_op(&self) -> bool {
1202        matches!(
1203            self,
1204            Token::LogicAnd
1205                | Token::LogicOr
1206                | Token::LogicNot
1207                | Token::LogicXor
1208                | Token::Top
1209                | Token::Bottom
1210        )
1211    }
1212
1213    pub fn is_evidentiality(&self) -> bool {
1214        matches!(
1215            self,
1216            Token::Bang | Token::Question | Token::Tilde | Token::Interrobang | Token::Lozenge
1217        )
1218    }
1219
1220    pub fn is_legion_morpheme(&self) -> bool {
1221        matches!(
1222            self,
1223            Token::LegionField      // ∿ - collective memory
1224                | Token::DirectSum  // ⊕ - superposition
1225                | Token::Interfere  // ⫰ - interference
1226                | Token::ConfidenceHigh  // ◉ - resonance (dual-purpose)
1227                | Token::Distribute // ⟁ - holographic distribution
1228                | Token::Gather     // ⟀ - interference gathering
1229                | Token::Broadcast  // ↠ - one-to-many
1230                | Token::Consensus  // ⇢ - many-to-one
1231                | Token::Partial // ∂ - decay
1232        )
1233    }
1234
1235    pub fn is_legion_assign(&self) -> bool {
1236        matches!(
1237            self,
1238            Token::DirectSumEq | Token::PartialEq_ | Token::InterfereEq
1239        )
1240    }
1241
1242    pub fn is_affective(&self) -> bool {
1243        matches!(
1244            self,
1245            // Sentiment
1246            Token::DirectSum |  // ⊕ positive (dual-purpose with DirectSum)
1247            Token::AffectNegative |  // ⊖ negative
1248            Token::AffectNeutral |  // ⊜ neutral
1249            // Sarcasm
1250            Token::IronyMark |  // ⸮ irony/sarcasm
1251            // Intensity
1252            Token::IntensityUp |  // ↑
1253            Token::IntensityDown |  // ↓
1254            Token::IntensityMax |  // ⇈
1255            // Formality
1256            Token::FormalRegister |  // ♔
1257            Token::InformalRegister |  // ♟
1258            // Emotions
1259            Token::EmotionJoy |  // ☺
1260            Token::EmotionSadness |  // ☹
1261            Token::EmotionAnger |  // ⚡
1262            Token::EmotionFear |  // ❄
1263            Token::EmotionSurprise |  // ✦
1264            Token::EmotionLove |  // ♡
1265            // Confidence
1266            Token::ConfidenceHigh |  // ◉
1267            Token::ConfidenceMedium |  // ◎
1268            Token::ConfidenceLow // ○
1269        )
1270    }
1271
1272    pub fn is_sentiment(&self) -> bool {
1273        matches!(
1274            self,
1275            Token::DirectSum | Token::AffectNegative | Token::AffectNeutral
1276        )
1277    }
1278
1279    pub fn is_emotion(&self) -> bool {
1280        matches!(
1281            self,
1282            Token::EmotionJoy
1283                | Token::EmotionSadness
1284                | Token::EmotionAnger
1285                | Token::EmotionFear
1286                | Token::EmotionSurprise
1287                | Token::EmotionLove
1288        )
1289    }
1290
1291    pub fn is_intensity(&self) -> bool {
1292        matches!(
1293            self,
1294            Token::IntensityUp | Token::IntensityDown | Token::IntensityMax
1295        )
1296    }
1297}
1298
/// Lexer wrapping Logos for Sigil.
pub struct Lexer<'a> {
    /// Underlying logos-generated token stream over the source text.
    inner: logos::Lexer<'a, Token>,
    /// Buffer for lookahead tokens (supports multi-token peek).
    /// Filled by `peek_n`, drained from the front by `next_token`.
    /// An element of `None` means EOF was reached while buffering.
    buffer: Vec<Option<(Token, Span)>>,
}
1305
1306impl<'a> Lexer<'a> {
1307    pub fn new(source: &'a str) -> Self {
1308        Self {
1309            inner: Token::lexer(source),
1310            buffer: Vec::new(),
1311        }
1312    }
1313
1314    /// Read the next token from the underlying logos lexer
1315    fn read_next(&mut self) -> Option<(Token, Span)> {
1316        match self.inner.next() {
1317            Some(Ok(token)) => {
1318                let span = self.inner.span();
1319                Some((token, Span::new(span.start, span.end)))
1320            }
1321            Some(Err(_)) => {
1322                // Skip invalid tokens and try next
1323                self.read_next()
1324            }
1325            None => None,
1326        }
1327    }
1328
1329    pub fn next_token(&mut self) -> Option<(Token, Span)> {
1330        if !self.buffer.is_empty() {
1331            // Return from buffer (front = next token)
1332            // Each buffer element is Option<(Token, Span)> where None = EOF
1333            return self.buffer.remove(0);
1334        }
1335        self.read_next()
1336    }
1337
1338    pub fn peek(&mut self) -> Option<&(Token, Span)> {
1339        self.peek_n(0)
1340    }
1341
1342    /// Peek n tokens ahead (0 = next token, 1 = token after that, etc.)
1343    pub fn peek_n(&mut self, n: usize) -> Option<&(Token, Span)> {
1344        // Fill buffer up to position n
1345        while self.buffer.len() <= n {
1346            let token = self.read_next();
1347            self.buffer.push(token);
1348        }
1349        self.buffer.get(n).and_then(|opt| opt.as_ref())
1350    }
1351
1352    pub fn span(&self) -> Span {
1353        let span = self.inner.span();
1354        Span::new(span.start, span.end)
1355    }
1356}
1357
#[cfg(test)]
mod tests {
    use super::*;

    /// Lex `src` to completion, collecting every token (spans discarded).
    fn toks(src: &str) -> Vec<Token> {
        let mut lex = Lexer::new(src);
        let mut out = Vec::new();
        while let Some((t, _)) = lex.next_token() {
            out.push(t);
        }
        out
    }

    #[test]
    fn test_morphemes() {
        let t = toks("τ φ σ ρ λ Σ Π ⌛");
        assert!(matches!(t[0], Token::Tau));
        assert!(matches!(t[1], Token::Phi));
        assert!(matches!(t[2], Token::Sigma));
        assert!(matches!(t[3], Token::Rho));
        assert!(matches!(t[4], Token::Lambda));
        // Upper-case Σ lexes to the same Sigma token as lower-case σ.
        assert!(matches!(t[5], Token::Sigma));
        assert!(matches!(t[6], Token::Pi));
        assert!(matches!(t[7], Token::Hourglass));
    }

    #[test]
    fn test_evidentiality() {
        let t = toks("value! uncertain? reported~ paradox‽");
        assert!(matches!(&t[0], Token::Ident(s) if s == "value"));
        assert!(matches!(t[1], Token::Bang));
        assert!(matches!(&t[2], Token::Ident(s) if s == "uncertain"));
        assert!(matches!(t[3], Token::Question));
        assert!(matches!(&t[4], Token::Ident(s) if s == "reported"));
        assert!(matches!(t[5], Token::Tilde));
        assert!(matches!(&t[6], Token::Ident(s) if s == "paradox"));
        assert!(matches!(t[7], Token::Interrobang));
    }

    #[test]
    fn test_pipe_chain() {
        let t = toks("data|τ{f}|φ{p}|σ");
        assert!(matches!(&t[0], Token::Ident(s) if s == "data"));
        assert!(matches!(t[1], Token::Pipe));
        assert!(matches!(t[2], Token::Tau));
        assert!(matches!(t[3], Token::LBrace));
        assert!(matches!(&t[4], Token::Ident(s) if s == "f"));
        assert!(matches!(t[5], Token::RBrace));
    }

    #[test]
    fn test_numbers() {
        // Each numeral base keeps its prefix in the token's raw text.
        let t = toks("42 0b1010 0o52 0x2A 0v22 0s42 3.14");
        assert!(matches!(&t[0], Token::IntLit(s) if s == "42"));
        assert!(matches!(&t[1], Token::BinaryLit(s) if s == "0b1010"));
        assert!(matches!(&t[2], Token::OctalLit(s) if s == "0o52"));
        assert!(matches!(&t[3], Token::HexLit(s) if s == "0x2A"));
        assert!(matches!(&t[4], Token::VigesimalLit(s) if s == "0v22"));
        assert!(matches!(&t[5], Token::SexagesimalLit(s) if s == "0s42"));
        assert!(matches!(&t[6], Token::FloatLit(s) if s == "3.14"));
    }

    #[test]
    fn test_incorporation() {
        let t = toks("file·open·read");
        assert!(matches!(&t[0], Token::Ident(s) if s == "file"));
        assert!(matches!(t[1], Token::MiddleDot));
        assert!(matches!(&t[2], Token::Ident(s) if s == "open"));
        assert!(matches!(t[3], Token::MiddleDot));
        assert!(matches!(&t[4], Token::Ident(s) if s == "read"));
    }

    #[test]
    fn test_special_symbols() {
        let t = toks("∅ ◯ ∞");
        assert!(matches!(t[0], Token::Empty));
        assert!(matches!(t[1], Token::Circle));
        assert!(matches!(t[2], Token::Infinity));
    }

    #[test]
    fn test_quantifiers() {
        let t = toks("∀x ∃y x∈S y∉T");
        assert!(matches!(t[0], Token::ForAll));
        assert!(matches!(&t[1], Token::Ident(s) if s == "x"));
        assert!(matches!(t[2], Token::Exists));
        assert!(matches!(&t[3], Token::Ident(s) if s == "y"));
        assert!(matches!(&t[4], Token::Ident(s) if s == "x"));
        assert!(matches!(t[5], Token::ElementOf));
    }

    #[test]
    fn test_set_operations() {
        let t = toks("A∪B A∩B A∖B A⊂B A⊆B");
        assert!(matches!(&t[0], Token::Ident(s) if s == "A"));
        assert!(matches!(t[1], Token::Union));
        assert!(matches!(&t[2], Token::Ident(s) if s == "B"));
        assert!(matches!(&t[3], Token::Ident(s) if s == "A"));
        assert!(matches!(t[4], Token::Intersection));
    }

    #[test]
    fn test_logic_operators() {
        let t = toks("p∧q p∨q ¬p p⊻q ⊤ ⊥");
        assert!(matches!(&t[0], Token::Ident(s) if s == "p"));
        assert!(matches!(t[1], Token::LogicAnd));
        assert!(matches!(&t[2], Token::Ident(s) if s == "q"));
        assert!(matches!(&t[3], Token::Ident(s) if s == "p"));
        assert!(matches!(t[4], Token::LogicOr));
    }

    #[test]
    fn test_analysis_operators() {
        let t = toks("∫f ∂g √x ∛y f∘g");
        assert!(matches!(t[0], Token::Integral));
        assert!(matches!(&t[1], Token::Ident(s) if s == "f"));
        assert!(matches!(t[2], Token::Partial));
        assert!(matches!(&t[3], Token::Ident(s) if s == "g"));
        assert!(matches!(t[4], Token::Sqrt));
    }

    #[test]
    fn test_additional_morphemes() {
        let t = toks("δ ε ω α ζ");
        assert!(matches!(t[0], Token::Delta));
        assert!(matches!(t[1], Token::Epsilon));
        assert!(matches!(t[2], Token::Omega));
        assert!(matches!(t[3], Token::Alpha));
        assert!(matches!(t[4], Token::Zeta));
    }

    #[test]
    fn test_ffi_keywords() {
        let t = toks("extern unsafe");
        assert!(matches!(t[0], Token::Extern));
        assert!(matches!(t[1], Token::Unsafe));
    }

    #[test]
    fn test_parallel_morphemes() {
        // Symbol and keyword spellings map to the same tokens.
        let t = toks("∥ parallel ⊛ gpu");
        assert!(matches!(t[0], Token::Parallel));
        assert!(matches!(t[1], Token::Parallel));
        assert!(matches!(t[2], Token::Gpu));
        assert!(matches!(t[3], Token::Gpu));
    }

    #[test]
    fn test_lifetime_labels() {
        // Loop labels use the lifetime syntax; the leading ' is stripped.
        let t = toks("'outer: loop { break 'outer }");
        assert!(matches!(&t[0], Token::Lifetime(s) if s == "outer"));
        assert!(matches!(t[1], Token::Colon));
        assert!(matches!(t[2], Token::Loop));
        assert!(matches!(t[3], Token::LBrace));
        assert!(matches!(t[4], Token::Break));
        assert!(matches!(&t[5], Token::Lifetime(s) if s == "outer"));
        assert!(matches!(t[6], Token::RBrace));
    }

    // ==================== STRING LITERAL TESTS ====================

    #[test]
    fn test_string_escape_sequences() {
        // Every simple escape (\n \t \r \\ \" \0) is resolved by the lexer.
        let cases: &[(&str, &str)] = &[
            (r#""hello\nworld""#, "hello\nworld"),
            (r#""hello\tworld""#, "hello\tworld"),
            (r#""hello\rworld""#, "hello\rworld"),
            (r#""hello\\world""#, "hello\\world"),
            (r#""hello\"world""#, "hello\"world"),
            (r#""hello\0world""#, "hello\0world"),
        ];
        for &(src, expected) in cases {
            let mut lex = Lexer::new(src);
            match lex.next_token() {
                Some((Token::StringLit(s), _)) => assert_eq!(s, expected),
                other => panic!("Expected StringLit for {}, got {:?}", src, other),
            }
        }
    }

    #[test]
    fn test_string_hex_escape() {
        // \xNN hex escape resolves to the corresponding character.
        let mut lex = Lexer::new(r#""hello\x41world""#);
        assert!(matches!(lex.next_token(), Some((Token::StringLit(s), _)) if s == "helloAworld"));
    }

    #[test]
    fn test_string_unicode_escape() {
        // \u{NNNN} resolves to the code point, including astral-plane chars.
        let mut lex = Lexer::new(r#""hello\u{1F600}world""#);
        assert!(matches!(lex.next_token(), Some((Token::StringLit(s), _)) if s == "hello😀world"));

        // Greek letter via escape.
        let mut lex = Lexer::new(r#""\u{03C4}""#);
        assert!(matches!(lex.next_token(), Some((Token::StringLit(s), _)) if s == "τ"));
    }

    #[test]
    fn test_char_escape_sequences() {
        let cases: &[(&str, char)] = &[(r"'\n'", '\n'), (r"'\t'", '\t'), (r"'\\'", '\\')];
        for &(src, expected) in cases {
            let mut lex = Lexer::new(src);
            match lex.next_token() {
                Some((Token::CharLit(c), _)) => assert_eq!(c, expected),
                other => panic!("Expected CharLit for {}, got {:?}", src, other),
            }
        }
    }

    #[test]
    fn test_raw_string() {
        // Raw strings keep backslash sequences verbatim.
        let mut lex = Lexer::new(r#"r"hello\nworld""#);
        assert!(
            matches!(lex.next_token(), Some((Token::RawStringLit(s), _)) if s == r"hello\nworld")
        );
    }

    #[test]
    fn test_raw_string_delimited() {
        // r#"..."# style may contain unescaped interior quotes.
        let mut lex = Lexer::new(r##"r#"hello "world""#"##);
        assert!(matches!(
            lex.next_token(),
            Some((Token::RawStringDelimited(s), _)) if s == r#"hello "world""#
        ));
    }

    #[test]
    fn test_byte_string() {
        let mut lex = Lexer::new(r#"b"hello""#);
        match lex.next_token() {
            // "hello" in ASCII
            Some((Token::ByteStringLit(bytes), _)) => assert_eq!(bytes, b"hello".to_vec()),
            other => panic!("Expected ByteStringLit, got {:?}", other),
        }
    }

    #[test]
    fn test_interpolated_string() {
        // The {name} placeholder survives lexing untouched.
        let mut lex = Lexer::new(r#"f"hello {name}""#);
        assert!(matches!(
            lex.next_token(),
            Some((Token::InterpolatedStringLit(s), _)) if s == "hello {name}"
        ));
    }

    #[test]
    fn test_sigil_string_sql() {
        let mut lex = Lexer::new(r#"σ"SELECT * FROM {table}""#);
        assert!(matches!(
            lex.next_token(),
            Some((Token::SigilStringSql(s), _)) if s == "SELECT * FROM {table}"
        ));
    }

    #[test]
    fn test_sigil_string_route() {
        let mut lex = Lexer::new(r#"ρ"/api/v1/{resource}/{id}""#);
        assert!(matches!(
            lex.next_token(),
            Some((Token::SigilStringRoute(s), _)) if s == "/api/v1/{resource}/{id}"
        ));
    }

    #[test]
    fn test_unicode_in_strings() {
        // Literal (non-escaped) Unicode passes through unchanged.
        let mut lex = Lexer::new(r#""τφσρ 你好 🦀""#);
        assert!(matches!(lex.next_token(), Some((Token::StringLit(s), _)) if s == "τφσρ 你好 🦀"));
    }

    #[test]
    fn test_empty_string() {
        let mut lex = Lexer::new(r#""""#);
        assert!(matches!(lex.next_token(), Some((Token::StringLit(s), _)) if s.is_empty()));
    }

    #[test]
    fn test_escape_sequence_helper() {
        // Unit-test the helper function directly.
        let cases: &[(&str, &str)] = &[
            (r"hello\nworld", "hello\nworld"),
            (r"hello\tworld", "hello\tworld"),
            (r"hello\\world", "hello\\world"),
            (r#"hello\"world"#, "hello\"world"),
            (r"hello\x41world", "helloAworld"),
            (r"hello\u{1F600}world", "hello😀world"),
        ];
        for &(input, expected) in cases {
            assert_eq!(process_escape_sequences(input), expected);
        }
    }
}