// sigil_parser/lexer.rs

1//! Lexer for the Sigil programming language.
2//!
3//! Handles polysynthetic morphemes, evidentiality markers, and multi-base numerals.
4
5use crate::span::Span;
6use logos::Logos;
7
/// Process escape sequences in a string literal.
///
/// Converts `\n`, `\t`, `\r`, `\\`, `\"`, `\'`, `\0`, `\xNN`, and `\u{NNNN}`
/// to their actual characters. Unknown escapes (e.g. `\q`) are kept verbatim.
/// Malformed `\x` (no hex digits) and `\u` (no `{`) escapes are likewise
/// preserved as written — previously they were silently dropped, which lost
/// input characters.
fn process_escape_sequences(s: &str) -> String {
    let mut result = String::with_capacity(s.len());
    let mut chars = s.chars().peekable();

    while let Some(c) = chars.next() {
        if c != '\\' {
            result.push(c);
            continue;
        }
        match chars.next() {
            Some('n') => result.push('\n'),
            Some('t') => result.push('\t'),
            Some('r') => result.push('\r'),
            Some('\\') => result.push('\\'),
            Some('"') => result.push('"'),
            Some('\'') => result.push('\''),
            Some('0') => result.push('\0'),
            Some('x') => {
                // \xNN - up to two contiguous hex digits.
                let mut hex = String::new();
                for _ in 0..2 {
                    match chars.peek() {
                        Some(&d) if d.is_ascii_hexdigit() => {
                            hex.push(d);
                            chars.next();
                        }
                        _ => break,
                    }
                }
                if let Ok(val) = u8::from_str_radix(&hex, 16) {
                    // u8 -> char maps 0..=255 to the same Unicode scalar.
                    result.push(val as char);
                } else {
                    // No hex digits followed \x: keep the escape verbatim
                    // (consistent with the unknown-escape policy below).
                    result.push_str("\\x");
                }
            }
            Some('u') => {
                // \u{NNNN} - Unicode code point in braces.
                if chars.peek() == Some(&'{') {
                    chars.next(); // consume '{'
                    let mut hex = String::new();
                    while let Some(&d) = chars.peek() {
                        if d == '}' {
                            chars.next();
                            break;
                        }
                        if d.is_ascii_hexdigit() {
                            hex.push(d);
                            chars.next();
                        } else {
                            break;
                        }
                    }
                    // Invalid code points (surrogates, > U+10FFFF) are dropped.
                    if let Ok(val) = u32::from_str_radix(&hex, 16) {
                        if let Some(decoded) = char::from_u32(val) {
                            result.push(decoded);
                        }
                    }
                } else {
                    // `\u` not followed by `{`: keep the escape verbatim.
                    result.push_str("\\u");
                }
            }
            Some(other) => {
                // Unknown escape, keep as-is.
                result.push('\\');
                result.push(other);
            }
            // Trailing lone backslash at end of input.
            None => result.push('\\'),
        }
    }
    result
}
74
75/// Callback for delimited raw strings (r#"..."#).
76/// Reads until the closing "# is found.
77fn raw_string_delimited_callback(lex: &mut logos::Lexer<'_, Token>) -> Option<String> {
78    let remainder = lex.remainder();
79
80    // Find the closing "#
81    if let Some(end_pos) = remainder.find("\"#") {
82        let content = &remainder[..end_pos];
83        // Bump past content and closing "# (2 chars)
84        lex.bump(end_pos + 2);
85        Some(content.to_string())
86    } else {
87        None
88    }
89}
90
91/// Callback for multi-line string literals.
92/// Reads from """ until the next """ is found.
93fn multiline_string_callback(lex: &mut logos::Lexer<'_, Token>) -> Option<String> {
94    let remainder = lex.remainder();
95
96    // Find the closing """
97    if let Some(end_pos) = remainder.find("\"\"\"") {
98        let content = &remainder[..end_pos];
99        // Bump the lexer past the content and closing quotes
100        lex.bump(end_pos + 3);
101        Some(process_escape_sequences(content))
102    } else {
103        // No closing """ found - skip to end and return what we have
104        None
105    }
106}
107
/// Process escape sequences in a character literal.
///
/// Returns the decoded character. Malformed escapes and empty input decode
/// to the `'?'` placeholder, matching the lexer's existing error policy.
fn process_char_escape(s: &str) -> char {
    let mut it = s.chars();
    match it.next() {
        None => '?',
        // Plain (unescaped) character: return it directly.
        Some(c) if c != '\\' => c,
        Some(_) => match it.next() {
            Some('n') => '\n',
            Some('t') => '\t',
            Some('r') => '\r',
            Some('\\') => '\\',
            Some('"') => '"',
            Some('\'') => '\'',
            Some('0') => '\0',
            Some('x') => {
                // \xNN - two hex digits, e.g. \x41 -> 'A'.
                let digits: String = it.take(2).collect();
                match u8::from_str_radix(&digits, 16) {
                    Ok(byte) => byte as char,
                    Err(_) => '?',
                }
            }
            Some('u') => match it.next() {
                // \u{NNNN} - hex digits up to the closing brace.
                Some('{') => {
                    let digits: String = it.take_while(|&c| c != '}').collect();
                    u32::from_str_radix(&digits, 16)
                        .ok()
                        .and_then(char::from_u32)
                        .unwrap_or('?')
                }
                _ => '?',
            },
            // Unknown escape: yield the escaped character itself.
            Some(other) => other,
            // Lone trailing backslash.
            None => '?',
        },
    }
}
144
/// Token types for Sigil.
///
/// Produced by the `#[derive(Logos)]` lexer. Whitespace (spaces, tabs,
/// carriage returns, newlines, form feeds) is skipped between tokens via
/// the `skip` attribute; overlapping patterns are resolved by logos'
/// longest-match / pattern-priority rules.
#[derive(Logos, Debug, Clone, PartialEq)]
#[logos(skip r"[ \t\r\n\f]+")]
pub enum Token {
    // === Comments ===
    // NOTE(review): a `//!...` doc comment also matches this LineComment
    // pattern at the same length; logos is expected to prefer DocComment
    // via its longer literal prefix / higher priority — TODO confirm with
    // a lexer test.
    #[regex(r"//[^\n]*", |lex| lex.slice().to_string())]
    LineComment(String),

    #[regex(r"//![^\n]*", |lex| lex.slice().to_string())]
    DocComment(String),

    // === Keywords ===
    #[token("fn")]
    Fn,
    #[token("async")]
    Async,
    #[token("let")]
    Let,
    #[token("mut")]
    Mut,
    #[token("const")]
    Const,
    #[token("type")]
    Type,
    #[token("struct")]
    Struct,
    #[token("enum")]
    Enum,
    #[token("trait")]
    Trait,
    #[token("impl")]
    Impl,
    #[token("mod")]
    Mod,
    #[token("use")]
    Use,
    #[token("pub")]
    Pub,
    #[token("actor")]
    Actor,
    #[token("saga")]
    Saga,
    #[token("scope")]
    Scope,
    #[token("rune")]
    Rune,

    // Control flow
    #[token("if")]
    If,
    #[token("else")]
    Else,
    #[token("match")]
    Match,
    #[token("loop")]
    Loop,
    #[token("while")]
    While,
    #[token("for")]
    For,
    #[token("in")]
    In,
    #[token("break")]
    Break,
    #[token("continue")]
    Continue,
    #[token("return")]
    Return,
    #[token("yield")]
    Yield,
    #[token("await")]
    Await,

    // Other keywords
    #[token("self")]
    SelfLower,
    #[token("Self")]
    SelfUpper,
    #[token("super")]
    Super,
    #[token("crate")]
    Crate,
    #[token("where")]
    Where,
    #[token("as")]
    As,
    #[token("dyn")]
    Dyn,
    #[token("move")]
    Move,
    #[token("ref")]
    Ref,
    #[token("static")]
    Static,
    #[token("unsafe")]
    Unsafe,
    #[token("extern")]
    Extern,
    #[token("asm")]
    Asm,
    #[token("volatile")]
    Volatile,
    #[token("naked")]
    Naked,
    #[token("packed")]
    Packed,
    #[token("simd")]
    Simd,
    #[token("atomic")]
    Atomic,
    #[token("derive")]
    Derive,
    #[token("on")]
    On,

    // Boolean literals
    #[token("true")]
    True,
    #[token("false")]
    False,

    // Null literal
    #[token("null")]
    Null,

    // === Morphemes (Greek letters) ===
    // Each morpheme accepts both lowercase and uppercase forms where two
    // #[token] attributes are stacked on one variant.
    #[token("τ")]
    #[token("Τ")]
    Tau, // Transform/map

    #[token("φ")]
    #[token("Φ")]
    Phi, // Filter

    #[token("σ")]
    #[token("Σ")]
    Sigma, // Sort (lowercase) / Sum (uppercase)

    #[token("ρ")]
    #[token("Ρ")]
    Rho, // Reduce

    #[token("λ")]
    #[token("Λ")]
    Lambda, // Lambda

    #[token("Π")]
    Pi, // Product

    #[token("⌛")]
    Hourglass, // Await symbol

    // Additional morphemes
    #[token("δ")]
    #[token("Δ")]
    Delta, // Difference/change

    #[token("ε")]
    Epsilon, // Empty/null

    #[token("ω")]
    #[token("Ω")]
    Omega, // End/terminal

    #[token("α")]
    Alpha, // First element

    #[token("ζ")]
    Zeta, // Zip/combine

    // === Additional Access Morphemes ===
    #[token("μ")]
    #[token("Μ")]
    Mu, // Middle/median element

    #[token("χ")]
    #[token("Χ")]
    Chi, // Random/choice (from chaos)

    #[token("ν")]
    #[token("Ν")]
    Nu, // Nth element (ordinal)

    #[token("ξ")]
    #[token("Ξ")]
    Xi, // Next in sequence

    // === Parallel/Concurrency Morphemes ===
    // These also accept an ASCII keyword spelling as a fallback.
    #[token("∥")]
    #[token("parallel")]
    Parallel, // Parallel execution (U+2225)

    #[token("⊛")]
    #[token("gpu")]
    Gpu, // GPU compute shader (U+229B - circled asterisk)

    // === Quantifiers (for AI-native set operations) ===
    #[token("∀")]
    ForAll, // Universal quantification

    #[token("∃")]
    Exists, // Existential quantification

    #[token("∈")]
    ElementOf, // Membership test

    #[token("∉")]
    NotElementOf, // Non-membership

    // === Set Operations ===
    #[token("∪")]
    Union, // Set union

    #[token("∩")]
    Intersection, // Set intersection

    #[token("∖")]
    SetMinus, // Set difference

    #[token("⊂")]
    Subset, // Proper subset

    #[token("⊆")]
    SubsetEq, // Subset or equal

    #[token("⊃")]
    Superset, // Proper superset

    #[token("⊇")]
    SupersetEq, // Superset or equal

    // === Logic Operators ===
    #[token("∧")]
    LogicAnd, // Logical conjunction

    #[token("∨")]
    LogicOr, // Logical disjunction

    #[token("¬")]
    LogicNot, // Logical negation

    #[token("⊻")]
    LogicXor, // Exclusive or

    #[token("⊤")]
    Top, // True/any type

    #[token("⊥")]
    Bottom, // False/never type

    // === Bitwise Operators (Unicode) ===
    #[token("⋏")]
    BitwiseAndSymbol, // Bitwise AND (U+22CF)

    #[token("⋎")]
    BitwiseOrSymbol, // Bitwise OR (U+22CE)

    // === Type Theory ===
    #[token("∷")]
    TypeAnnotation, // Type annotation (alternative to :)

    // === Analysis/Calculus ===
    #[token("∫")]
    Integral, // Cumulative sum

    #[token("∂")]
    Partial, // Discrete derivative

    #[token("√")]
    Sqrt, // Square root

    #[token("∛")]
    Cbrt, // Cube root

    #[token("∇")]
    Nabla, // Gradient (U+2207)

    // === APL-Inspired Symbols ===
    #[token("⍋")]
    GradeUp, // Sort ascending (U+234B)

    #[token("⍒")]
    GradeDown, // Sort descending (U+2352)

    #[token("⌽")]
    Rotate, // Reverse/rotate (U+233D)

    #[token("↻")]
    CycleArrow, // Cycle/repeat (U+21BB)

    #[token("⌺")]
    QuadDiamond, // Windows/stencil (U+233A)

    #[token("⊞")]
    SquaredPlus, // Chunks (U+229E)

    #[token("⍳")]
    Iota, // Enumerate/index (U+2373)

    // === Category Theory ===
    #[token("∘")]
    Compose, // Function composition

    #[token("⊗")]
    Tensor, // Tensor product

    #[token("⊕")]
    DirectSum, // Direct sum / XOR

    // === Data Operations ===
    #[token("⋈")]
    Bowtie, // Join/zip combining (U+22C8)

    #[token("⋳")]
    ElementSmallVerticalBar, // Flatten (U+22F3)

    #[token("⊔")]
    SquareCup, // Lattice join / supremum (U+2294)

    #[token("⊓")]
    SquareCap, // Lattice meet / infimum (U+2293)

    // === Evidentiality Markers ===
    // Note: These are handled contextually since ! and ? have other uses
    #[token("‽")]
    Interrobang, // Paradox/trust boundary

    // === Affective Markers (Sentiment & Emotion) ===
    // Sentiment polarity
    #[token("⊖")]
    AffectNegative, // Negative sentiment (U+2296 Circled Minus)

    #[token("⊜")]
    AffectNeutral, // Neutral sentiment (U+229C Circled Equals)

    // Note: ⊕ (U+2295) is already DirectSum - we'll use it dual-purpose for positive sentiment

    // Sarcasm/Irony
    #[token("⸮")]
    IronyMark, // Irony/sarcasm marker (U+2E2E - historical percontation point!)

    // Intensity modifiers
    #[token("↑")]
    IntensityUp, // Intensifier (U+2191)

    #[token("↓")]
    IntensityDown, // Dampener (U+2193)

    #[token("⇈")]
    IntensityMax, // Maximum intensity (U+21C8)

    // Formality register
    #[token("♔")]
    FormalRegister, // Formal (U+2654 White King)

    #[token("♟")]
    InformalRegister, // Informal (U+265F Black Pawn)

    // Emotion markers (Plutchik's wheel)
    #[token("☺")]
    EmotionJoy, // Joy (U+263A)

    #[token("☹")]
    EmotionSadness, // Sadness (U+2639)

    #[token("⚡")]
    EmotionAnger, // Anger (U+26A1)

    #[token("❄")]
    EmotionFear, // Fear (U+2744)

    #[token("✦")]
    EmotionSurprise, // Surprise (U+2726)

    #[token("♡")]
    EmotionLove, // Love/Trust (U+2661)

    // Confidence markers
    #[token("◉")]
    ConfidenceHigh, // High confidence (U+25C9)

    #[token("◎")]
    ConfidenceMedium, // Medium confidence (U+25CE)

    #[token("○")]
    ConfidenceLow, // Low confidence (U+25CB)

    // === Aspect Morphemes (verb aspects) ===
    // These begin with `·` (MiddleDot below); logos takes the longest match,
    // so e.g. `·ing` lexes as one aspect token rather than `·` + `ing`.
    #[token("·ing")]
    AspectProgressive, // Ongoing/streaming aspect

    #[token("·ed")]
    AspectPerfective, // Completed aspect

    #[token("·able")]
    AspectPotential, // Capability aspect

    #[token("·ive")]
    AspectResultative, // Result-producing aspect

    // === Operators ===
    #[token("|")]
    Pipe,
    #[token("·")]
    MiddleDot, // Incorporation
    #[token("->")]
    Arrow,
    #[token("=>")]
    FatArrow,
    #[token("<-")]
    LeftArrow,
    #[token("==")]
    EqEq,
    #[token("!=")]
    NotEq,
    #[token("<=")]
    LtEq,
    #[token(">=")]
    GtEq,
    #[token("<")]
    Lt,
    #[token(">")]
    Gt,
    #[token("+")]
    Plus,
    #[token("-")]
    Minus,
    #[token("*")]
    Star,
    #[token("/")]
    Slash,
    #[token("%")]
    Percent,
    #[token("**")]
    StarStar, // Exponentiation
    #[token("&&")]
    AndAnd,
    #[token("||")]
    OrOr,
    #[token("!")]
    Bang, // Evidentiality: known / logical not
    #[token("?")]
    Question, // Evidentiality: uncertain / try
    #[token("~")]
    Tilde, // Evidentiality: reported
    #[token("&")]
    Amp,
    #[token("^")]
    Caret,
    #[token("<<")]
    Shl,
    #[token(">>")]
    Shr,
    #[token("=")]
    Eq,
    #[token("+=")]
    PlusEq,
    #[token("-=")]
    MinusEq,
    #[token("*=")]
    StarEq,
    #[token("/=")]
    SlashEq,
    #[token("..")]
    DotDot,
    #[token("..=")]
    DotDotEq,
    #[token("++")]
    PlusPlus, // Concatenation
    #[token("::")]
    ColonColon,
    #[token(":")]
    Colon,
    #[token(";")]
    Semi,
    #[token(",")]
    Comma,
    #[token(".")]
    Dot,
    #[token("@")]
    At,
    #[token("#!")]
    HashBang, // Inner attribute prefix #![...]
    #[token("#")]
    Hash,
    // Explicit priority so a lone `_` wins over the Ident regex below,
    // which would otherwise also match it.
    #[token("_", priority = 3)]
    Underscore,

    // === Delimiters ===
    #[token("(")]
    LParen,
    #[token(")")]
    RParen,
    #[token("{")]
    LBrace,
    #[token("}")]
    RBrace,
    #[token("[")]
    LBracket,
    #[token("]")]
    RBracket,

    // === Special symbols ===
    #[token("∅")]
    Empty, // Void/emptiness (śūnya)
    #[token("◯")]
    Circle, // Geometric zero
    #[token("∞")]
    Infinity, // Ananta

    // === Protocol Operations (Sigil-native networking) ===
    #[token("⇒")]
    ProtoSend, // Send data (U+21D2 - rightwards double arrow)

    #[token("⇐")]
    ProtoRecv, // Receive data (U+21D0 - leftwards double arrow)

    #[token("≋")]
    ProtoStream, // Stream data (U+224B - triple tilde)

    #[token("⊸")]
    ProtoConnect, // Connect/lollipop (U+22B8 - multimap)

    #[token("⏱")]
    ProtoTimeout, // Timeout (U+23F1 - stopwatch)

    // Note: ⊗ (Tensor) is used for close in protocol contexts

    // Protocol keywords for ASCII fallback
    #[token("send")]
    Send,
    #[token("recv")]
    Recv,
    #[token("stream")]
    Stream,
    #[token("connect")]
    Connect,
    #[token("close")]
    Close,
    #[token("timeout")]
    Timeout,
    #[token("retry")]
    Retry,
    #[token("header")]
    Header,
    #[token("body")]
    Body,

    // Protocol type identifiers (for incorporation: http·, ws·, grpc·, kafka·)
    #[token("http")]
    Http,
    #[token("https")]
    Https,
    #[token("ws")]
    Ws,
    #[token("wss")]
    Wss,
    #[token("grpc")]
    Grpc,
    #[token("kafka")]
    Kafka,
    #[token("amqp")]
    Amqp,
    #[token("graphql")]
    GraphQL,

    // === Numbers ===
    // All numeric literals keep their original spelling (prefix, digits,
    // underscores) as a String; base conversion happens later.
    // Binary: 0b...
    #[regex(r"0b[01_]+", |lex| lex.slice().to_string())]
    BinaryLit(String),

    // Octal: 0o...
    #[regex(r"0o[0-7_]+", |lex| lex.slice().to_string())]
    OctalLit(String),

    // Hex: 0x...
    #[regex(r"0x[0-9a-fA-F_]+", |lex| lex.slice().to_string())]
    HexLit(String),

    // Vigesimal: 0v... (base 20)
    #[regex(r"0v[0-9a-jA-J_]+", |lex| lex.slice().to_string())]
    VigesimalLit(String),

    // Sexagesimal: 0s... (base 60)
    #[regex(r"0s[0-9a-zA-Z_]+", |lex| lex.slice().to_string())]
    SexagesimalLit(String),

    // Duodecimal: 0z... (base 12)
    #[regex(r"0z[0-9a-bA-B_]+", |lex| lex.slice().to_string())]
    DuodecimalLit(String),

    // Float: 123.456 or 1.23e10
    #[regex(r"[0-9][0-9_]*\.[0-9][0-9_]*([eE][+-]?[0-9_]+)?", |lex| lex.slice().to_string())]
    FloatLit(String),

    // Integer: 123
    #[regex(r"[0-9][0-9_]*", |lex| lex.slice().to_string())]
    IntLit(String),

    // === Strings ===
    // Regular string with escape sequence processing
    #[regex(r#""([^"\\]|\\.)*""#, |lex| {
        let s = lex.slice();
        let inner = &s[1..s.len()-1];
        process_escape_sequences(inner)
    })]
    StringLit(String),

    // Multi-line string (triple-quoted) - handled via callback
    #[token(r#"""""#, multiline_string_callback)]
    MultiLineStringLit(String),

    // Byte string literal
    // NOTE(review): escape sequences inside b"..." are NOT processed —
    // the raw bytes (including backslashes) are kept. Confirm intended.
    #[regex(r#"b"([^"\\]|\\.)*""#, |lex| {
        let s = lex.slice();
        let inner = &s[2..s.len()-1];
        inner.as_bytes().to_vec()
    })]
    ByteStringLit(Vec<u8>),

    // Interpolated string (will be parsed further for expressions)
    #[regex(r#"f"([^"\\]|\\.)*""#, |lex| {
        let s = lex.slice();
        let inner = &s[2..s.len()-1];
        process_escape_sequences(inner)
    })]
    InterpolatedStringLit(String),

    // Sigil string - SQL template (σ prefix)
    #[regex(r#"σ"([^"\\]|\\.)*""#, |lex| {
        let s = lex.slice();
        // Get byte index after the σ character (which is 2 bytes in UTF-8)
        let start = "σ".len() + 1; // σ + opening quote
        let inner = &s[start..s.len()-1];
        process_escape_sequences(inner)
    })]
    SigilStringSql(String),

    // Sigil string - Route template (ρ prefix)
    #[regex(r#"ρ"([^"\\]|\\.)*""#, |lex| {
        let s = lex.slice();
        // Get byte index after the ρ character (which is 2 bytes in UTF-8)
        let start = "ρ".len() + 1; // ρ + opening quote
        let inner = &s[start..s.len()-1];
        process_escape_sequences(inner)
    })]
    SigilStringRoute(String),

    // Char literal with escape sequence processing
    // NOTE(review): the regex allows only a single escaped character after
    // the backslash, so multi-char escapes like '\x41' or '\u{41}' cannot
    // match here — verify whether they should.
    #[regex(r"'([^'\\]|\\.)'", |lex| {
        let s = lex.slice();
        let inner = &s[1..s.len()-1];
        process_char_escape(inner)
    })]
    CharLit(char),

    // Raw string (no escape processing)
    #[regex(r#"r"[^"]*""#, |lex| {
        let s = lex.slice();
        s[2..s.len()-1].to_string()
    })]
    RawStringLit(String),

    // Raw string with delimiter (r#"..."# style) - handles internal quotes
    #[token(r##"r#""##, raw_string_delimited_callback)]
    RawStringDelimited(String),

    // === Identifiers ===
    #[regex(r"[a-zA-Z_][a-zA-Z0-9_]*", |lex| lex.slice().to_string())]
    Ident(String),

    // === Rune annotation ===
    #[regex(r"//@\s*rune:\s*[a-zA-Z_][a-zA-Z0-9_]*", |lex| lex.slice().to_string())]
    RuneAnnotation(String),
}
820
821impl Token {
822    pub fn is_keyword(&self) -> bool {
823        matches!(
824            self,
825            Token::Fn
826                | Token::Async
827                | Token::Let
828                | Token::Mut
829                | Token::Const
830                | Token::Type
831                | Token::Struct
832                | Token::Enum
833                | Token::Trait
834                | Token::Impl
835                | Token::Mod
836                | Token::Use
837                | Token::Pub
838                | Token::Actor
839                | Token::Saga
840                | Token::Scope
841                | Token::Rune
842                | Token::If
843                | Token::Else
844                | Token::Match
845                | Token::Loop
846                | Token::While
847                | Token::For
848                | Token::In
849                | Token::Break
850                | Token::Continue
851                | Token::Return
852                | Token::Yield
853                | Token::Await
854        )
855    }
856
857    pub fn is_morpheme(&self) -> bool {
858        matches!(
859            self,
860            Token::Tau | Token::Phi | Token::Sigma | Token::Rho |
861            Token::Lambda | Token::Pi | Token::Hourglass |
862            Token::Delta | Token::Epsilon | Token::Omega | Token::Alpha | Token::Zeta |
863            Token::Mu | Token::Chi | Token::Nu | Token::Xi |  // Access morphemes
864            Token::Parallel | Token::Gpu |  // Concurrency morphemes
865            Token::Integral | Token::Partial | Token::Sqrt | Token::Cbrt |
866            Token::Compose
867        )
868    }
869
870    pub fn is_aspect(&self) -> bool {
871        matches!(
872            self,
873            Token::AspectProgressive
874                | Token::AspectPerfective
875                | Token::AspectPotential
876                | Token::AspectResultative
877        )
878    }
879
880    pub fn is_data_op(&self) -> bool {
881        matches!(
882            self,
883            Token::Bowtie | Token::ElementSmallVerticalBar | Token::SquareCup | Token::SquareCap
884        )
885    }
886
887    pub fn is_bitwise_symbol(&self) -> bool {
888        matches!(self, Token::BitwiseAndSymbol | Token::BitwiseOrSymbol)
889    }
890
891    pub fn is_quantifier(&self) -> bool {
892        matches!(
893            self,
894            Token::ForAll | Token::Exists | Token::ElementOf | Token::NotElementOf
895        )
896    }
897
898    pub fn is_set_op(&self) -> bool {
899        matches!(
900            self,
901            Token::Union
902                | Token::Intersection
903                | Token::SetMinus
904                | Token::Subset
905                | Token::SubsetEq
906                | Token::Superset
907                | Token::SupersetEq
908        )
909    }
910
911    pub fn is_logic_op(&self) -> bool {
912        matches!(
913            self,
914            Token::LogicAnd
915                | Token::LogicOr
916                | Token::LogicNot
917                | Token::LogicXor
918                | Token::Top
919                | Token::Bottom
920        )
921    }
922
923    pub fn is_evidentiality(&self) -> bool {
924        matches!(
925            self,
926            Token::Bang | Token::Question | Token::Tilde | Token::Interrobang
927        )
928    }
929
930    pub fn is_affective(&self) -> bool {
931        matches!(
932            self,
933            // Sentiment
934            Token::DirectSum |  // ⊕ positive (dual-purpose with DirectSum)
935            Token::AffectNegative |  // ⊖ negative
936            Token::AffectNeutral |  // ⊜ neutral
937            // Sarcasm
938            Token::IronyMark |  // ⸮ irony/sarcasm
939            // Intensity
940            Token::IntensityUp |  // ↑
941            Token::IntensityDown |  // ↓
942            Token::IntensityMax |  // ⇈
943            // Formality
944            Token::FormalRegister |  // ♔
945            Token::InformalRegister |  // ♟
946            // Emotions
947            Token::EmotionJoy |  // ☺
948            Token::EmotionSadness |  // ☹
949            Token::EmotionAnger |  // ⚡
950            Token::EmotionFear |  // ❄
951            Token::EmotionSurprise |  // ✦
952            Token::EmotionLove |  // ♡
953            // Confidence
954            Token::ConfidenceHigh |  // ◉
955            Token::ConfidenceMedium |  // ◎
956            Token::ConfidenceLow // ○
957        )
958    }
959
960    pub fn is_sentiment(&self) -> bool {
961        matches!(
962            self,
963            Token::DirectSum | Token::AffectNegative | Token::AffectNeutral
964        )
965    }
966
967    pub fn is_emotion(&self) -> bool {
968        matches!(
969            self,
970            Token::EmotionJoy
971                | Token::EmotionSadness
972                | Token::EmotionAnger
973                | Token::EmotionFear
974                | Token::EmotionSurprise
975                | Token::EmotionLove
976        )
977    }
978
979    pub fn is_intensity(&self) -> bool {
980        matches!(
981            self,
982            Token::IntensityUp | Token::IntensityDown | Token::IntensityMax
983        )
984    }
985}
986
/// Lexer wrapping Logos for Sigil.
pub struct Lexer<'a> {
    // The logos-generated token stream over the source text.
    inner: logos::Lexer<'a, Token>,
    // One-token lookahead buffer filled by `peek()`:
    // `None` = nothing buffered; `Some(None)` = end-of-input was buffered;
    // `Some(Some(..))` = a token (with its span) is buffered.
    peeked: Option<Option<(Token, Span)>>,
}
992
993impl<'a> Lexer<'a> {
994    pub fn new(source: &'a str) -> Self {
995        Self {
996            inner: Token::lexer(source),
997            peeked: None,
998        }
999    }
1000
1001    pub fn next_token(&mut self) -> Option<(Token, Span)> {
1002        if let Some(peeked) = self.peeked.take() {
1003            return peeked;
1004        }
1005
1006        match self.inner.next() {
1007            Some(Ok(token)) => {
1008                let span = self.inner.span();
1009                Some((token, Span::new(span.start, span.end)))
1010            }
1011            Some(Err(_)) => {
1012                // Skip invalid tokens and try next
1013                self.next_token()
1014            }
1015            None => None,
1016        }
1017    }
1018
1019    pub fn peek(&mut self) -> Option<&(Token, Span)> {
1020        if self.peeked.is_none() {
1021            self.peeked = Some(self.next_token());
1022        }
1023        self.peeked.as_ref().and_then(|p| p.as_ref())
1024    }
1025
1026    pub fn span(&self) -> Span {
1027        let span = self.inner.span();
1028        Span::new(span.start, span.end)
1029    }
1030}
1031
1032#[cfg(test)]
1033mod tests {
1034    use super::*;
1035
1036    #[test]
1037    fn test_morphemes() {
1038        let mut lexer = Lexer::new("τ φ σ ρ λ Σ Π ⌛");
1039        assert!(matches!(lexer.next_token(), Some((Token::Tau, _))));
1040        assert!(matches!(lexer.next_token(), Some((Token::Phi, _))));
1041        assert!(matches!(lexer.next_token(), Some((Token::Sigma, _))));
1042        assert!(matches!(lexer.next_token(), Some((Token::Rho, _))));
1043        assert!(matches!(lexer.next_token(), Some((Token::Lambda, _))));
1044        assert!(matches!(lexer.next_token(), Some((Token::Sigma, _))));
1045        assert!(matches!(lexer.next_token(), Some((Token::Pi, _))));
1046        assert!(matches!(lexer.next_token(), Some((Token::Hourglass, _))));
1047    }
1048
1049    #[test]
1050    fn test_evidentiality() {
1051        let mut lexer = Lexer::new("value! uncertain? reported~ paradox‽");
1052        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "value"));
1053        assert!(matches!(lexer.next_token(), Some((Token::Bang, _))));
1054        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "uncertain"));
1055        assert!(matches!(lexer.next_token(), Some((Token::Question, _))));
1056        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "reported"));
1057        assert!(matches!(lexer.next_token(), Some((Token::Tilde, _))));
1058        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "paradox"));
1059        assert!(matches!(lexer.next_token(), Some((Token::Interrobang, _))));
1060    }
1061
1062    #[test]
1063    fn test_pipe_chain() {
1064        let mut lexer = Lexer::new("data|τ{f}|φ{p}|σ");
1065        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "data"));
1066        assert!(matches!(lexer.next_token(), Some((Token::Pipe, _))));
1067        assert!(matches!(lexer.next_token(), Some((Token::Tau, _))));
1068        assert!(matches!(lexer.next_token(), Some((Token::LBrace, _))));
1069        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "f"));
1070        assert!(matches!(lexer.next_token(), Some((Token::RBrace, _))));
1071    }
1072
1073    #[test]
1074    fn test_numbers() {
1075        let mut lexer = Lexer::new("42 0b1010 0o52 0x2A 0v22 0s42 3.14");
1076        assert!(matches!(lexer.next_token(), Some((Token::IntLit(s), _)) if s == "42"));
1077        assert!(matches!(lexer.next_token(), Some((Token::BinaryLit(s), _)) if s == "0b1010"));
1078        assert!(matches!(lexer.next_token(), Some((Token::OctalLit(s), _)) if s == "0o52"));
1079        assert!(matches!(lexer.next_token(), Some((Token::HexLit(s), _)) if s == "0x2A"));
1080        assert!(matches!(lexer.next_token(), Some((Token::VigesimalLit(s), _)) if s == "0v22"));
1081        assert!(matches!(lexer.next_token(), Some((Token::SexagesimalLit(s), _)) if s == "0s42"));
1082        assert!(matches!(lexer.next_token(), Some((Token::FloatLit(s), _)) if s == "3.14"));
1083    }
1084
1085    #[test]
1086    fn test_incorporation() {
1087        let mut lexer = Lexer::new("file·open·read");
1088        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "file"));
1089        assert!(matches!(lexer.next_token(), Some((Token::MiddleDot, _))));
1090        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "open"));
1091        assert!(matches!(lexer.next_token(), Some((Token::MiddleDot, _))));
1092        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "read"));
1093    }
1094
1095    #[test]
1096    fn test_special_symbols() {
1097        let mut lexer = Lexer::new("∅ ◯ ∞");
1098        assert!(matches!(lexer.next_token(), Some((Token::Empty, _))));
1099        assert!(matches!(lexer.next_token(), Some((Token::Circle, _))));
1100        assert!(matches!(lexer.next_token(), Some((Token::Infinity, _))));
1101    }
1102
1103    #[test]
1104    fn test_quantifiers() {
1105        let mut lexer = Lexer::new("∀x ∃y x∈S y∉T");
1106        assert!(matches!(lexer.next_token(), Some((Token::ForAll, _))));
1107        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "x"));
1108        assert!(matches!(lexer.next_token(), Some((Token::Exists, _))));
1109        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "y"));
1110        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "x"));
1111        assert!(matches!(lexer.next_token(), Some((Token::ElementOf, _))));
1112    }
1113
1114    #[test]
1115    fn test_set_operations() {
1116        let mut lexer = Lexer::new("A∪B A∩B A∖B A⊂B A⊆B");
1117        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "A"));
1118        assert!(matches!(lexer.next_token(), Some((Token::Union, _))));
1119        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "B"));
1120        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "A"));
1121        assert!(matches!(lexer.next_token(), Some((Token::Intersection, _))));
1122    }
1123
1124    #[test]
1125    fn test_logic_operators() {
1126        let mut lexer = Lexer::new("p∧q p∨q ¬p p⊻q ⊤ ⊥");
1127        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "p"));
1128        assert!(matches!(lexer.next_token(), Some((Token::LogicAnd, _))));
1129        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "q"));
1130        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "p"));
1131        assert!(matches!(lexer.next_token(), Some((Token::LogicOr, _))));
1132    }
1133
1134    #[test]
1135    fn test_analysis_operators() {
1136        let mut lexer = Lexer::new("∫f ∂g √x ∛y f∘g");
1137        assert!(matches!(lexer.next_token(), Some((Token::Integral, _))));
1138        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "f"));
1139        assert!(matches!(lexer.next_token(), Some((Token::Partial, _))));
1140        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "g"));
1141        assert!(matches!(lexer.next_token(), Some((Token::Sqrt, _))));
1142    }
1143
1144    #[test]
1145    fn test_additional_morphemes() {
1146        let mut lexer = Lexer::new("δ ε ω α ζ");
1147        assert!(matches!(lexer.next_token(), Some((Token::Delta, _))));
1148        assert!(matches!(lexer.next_token(), Some((Token::Epsilon, _))));
1149        assert!(matches!(lexer.next_token(), Some((Token::Omega, _))));
1150        assert!(matches!(lexer.next_token(), Some((Token::Alpha, _))));
1151        assert!(matches!(lexer.next_token(), Some((Token::Zeta, _))));
1152    }
1153
1154    #[test]
1155    fn test_ffi_keywords() {
1156        let mut lexer = Lexer::new("extern unsafe");
1157        assert!(matches!(lexer.next_token(), Some((Token::Extern, _))));
1158        assert!(matches!(lexer.next_token(), Some((Token::Unsafe, _))));
1159    }
1160
1161    #[test]
1162    fn test_parallel_morphemes() {
1163        let mut lexer = Lexer::new("∥ parallel ⊛ gpu");
1164        assert!(matches!(lexer.next_token(), Some((Token::Parallel, _))));
1165        assert!(matches!(lexer.next_token(), Some((Token::Parallel, _))));
1166        assert!(matches!(lexer.next_token(), Some((Token::Gpu, _))));
1167        assert!(matches!(lexer.next_token(), Some((Token::Gpu, _))));
1168    }
1169
1170    // ==================== STRING LITERAL TESTS ====================
1171
1172    #[test]
1173    fn test_string_escape_sequences() {
1174        // Test basic escape sequences
1175        let mut lexer = Lexer::new(r#""hello\nworld""#);
1176        match lexer.next_token() {
1177            Some((Token::StringLit(s), _)) => assert_eq!(s, "hello\nworld"),
1178            other => panic!("Expected StringLit, got {:?}", other),
1179        }
1180
1181        // Test tab escape
1182        let mut lexer = Lexer::new(r#""hello\tworld""#);
1183        match lexer.next_token() {
1184            Some((Token::StringLit(s), _)) => assert_eq!(s, "hello\tworld"),
1185            other => panic!("Expected StringLit, got {:?}", other),
1186        }
1187
1188        // Test carriage return
1189        let mut lexer = Lexer::new(r#""hello\rworld""#);
1190        match lexer.next_token() {
1191            Some((Token::StringLit(s), _)) => assert_eq!(s, "hello\rworld"),
1192            other => panic!("Expected StringLit, got {:?}", other),
1193        }
1194
1195        // Test escaped backslash
1196        let mut lexer = Lexer::new(r#""hello\\world""#);
1197        match lexer.next_token() {
1198            Some((Token::StringLit(s), _)) => assert_eq!(s, "hello\\world"),
1199            other => panic!("Expected StringLit, got {:?}", other),
1200        }
1201
1202        // Test escaped quote
1203        let mut lexer = Lexer::new(r#""hello\"world""#);
1204        match lexer.next_token() {
1205            Some((Token::StringLit(s), _)) => assert_eq!(s, "hello\"world"),
1206            other => panic!("Expected StringLit, got {:?}", other),
1207        }
1208
1209        // Test null character
1210        let mut lexer = Lexer::new(r#""hello\0world""#);
1211        match lexer.next_token() {
1212            Some((Token::StringLit(s), _)) => assert_eq!(s, "hello\0world"),
1213            other => panic!("Expected StringLit, got {:?}", other),
1214        }
1215    }
1216
1217    #[test]
1218    fn test_string_hex_escape() {
1219        // Test \xNN hex escape
1220        let mut lexer = Lexer::new(r#""hello\x41world""#);
1221        match lexer.next_token() {
1222            Some((Token::StringLit(s), _)) => assert_eq!(s, "helloAworld"),
1223            other => panic!("Expected StringLit, got {:?}", other),
1224        }
1225    }
1226
1227    #[test]
1228    fn test_string_unicode_escape() {
1229        // Test \u{NNNN} Unicode escape
1230        let mut lexer = Lexer::new(r#""hello\u{1F600}world""#);
1231        match lexer.next_token() {
1232            Some((Token::StringLit(s), _)) => assert_eq!(s, "hello😀world"),
1233            other => panic!("Expected StringLit, got {:?}", other),
1234        }
1235
1236        // Test Greek letter
1237        let mut lexer = Lexer::new(r#""\u{03C4}""#);
1238        match lexer.next_token() {
1239            Some((Token::StringLit(s), _)) => assert_eq!(s, "τ"),
1240            other => panic!("Expected StringLit, got {:?}", other),
1241        }
1242    }
1243
1244    #[test]
1245    fn test_char_escape_sequences() {
1246        let mut lexer = Lexer::new(r"'\n'");
1247        match lexer.next_token() {
1248            Some((Token::CharLit(c), _)) => assert_eq!(c, '\n'),
1249            other => panic!("Expected CharLit, got {:?}", other),
1250        }
1251
1252        let mut lexer = Lexer::new(r"'\t'");
1253        match lexer.next_token() {
1254            Some((Token::CharLit(c), _)) => assert_eq!(c, '\t'),
1255            other => panic!("Expected CharLit, got {:?}", other),
1256        }
1257
1258        let mut lexer = Lexer::new(r"'\\'");
1259        match lexer.next_token() {
1260            Some((Token::CharLit(c), _)) => assert_eq!(c, '\\'),
1261            other => panic!("Expected CharLit, got {:?}", other),
1262        }
1263    }
1264
1265    #[test]
1266    fn test_raw_string() {
1267        // Raw strings should NOT process escapes
1268        let mut lexer = Lexer::new(r#"r"hello\nworld""#);
1269        match lexer.next_token() {
1270            Some((Token::RawStringLit(s), _)) => assert_eq!(s, r"hello\nworld"),
1271            other => panic!("Expected RawStringLit, got {:?}", other),
1272        }
1273    }
1274
1275    #[test]
1276    fn test_raw_string_delimited() {
1277        // r#"..."# style
1278        let mut lexer = Lexer::new(r##"r#"hello "world""#"##);
1279        match lexer.next_token() {
1280            Some((Token::RawStringDelimited(s), _)) => assert_eq!(s, r#"hello "world""#),
1281            other => panic!("Expected RawStringDelimited, got {:?}", other),
1282        }
1283    }
1284
1285    #[test]
1286    fn test_byte_string() {
1287        let mut lexer = Lexer::new(r#"b"hello""#);
1288        match lexer.next_token() {
1289            Some((Token::ByteStringLit(bytes), _)) => {
1290                assert_eq!(bytes, vec![104, 101, 108, 108, 111]); // "hello" in ASCII
1291            }
1292            other => panic!("Expected ByteStringLit, got {:?}", other),
1293        }
1294    }
1295
1296    #[test]
1297    fn test_interpolated_string() {
1298        let mut lexer = Lexer::new(r#"f"hello {name}""#);
1299        match lexer.next_token() {
1300            Some((Token::InterpolatedStringLit(s), _)) => assert_eq!(s, "hello {name}"),
1301            other => panic!("Expected InterpolatedStringLit, got {:?}", other),
1302        }
1303    }
1304
1305    #[test]
1306    fn test_sigil_string_sql() {
1307        let mut lexer = Lexer::new(r#"σ"SELECT * FROM {table}""#);
1308        match lexer.next_token() {
1309            Some((Token::SigilStringSql(s), _)) => assert_eq!(s, "SELECT * FROM {table}"),
1310            other => panic!("Expected SigilStringSql, got {:?}", other),
1311        }
1312    }
1313
1314    #[test]
1315    fn test_sigil_string_route() {
1316        let mut lexer = Lexer::new(r#"ρ"/api/v1/{resource}/{id}""#);
1317        match lexer.next_token() {
1318            Some((Token::SigilStringRoute(s), _)) => assert_eq!(s, "/api/v1/{resource}/{id}"),
1319            other => panic!("Expected SigilStringRoute, got {:?}", other),
1320        }
1321    }
1322
1323    #[test]
1324    fn test_unicode_in_strings() {
1325        // Test direct Unicode in strings
1326        let mut lexer = Lexer::new(r#""τφσρ 你好 🦀""#);
1327        match lexer.next_token() {
1328            Some((Token::StringLit(s), _)) => assert_eq!(s, "τφσρ 你好 🦀"),
1329            other => panic!("Expected StringLit, got {:?}", other),
1330        }
1331    }
1332
1333    #[test]
1334    fn test_empty_string() {
1335        let mut lexer = Lexer::new(r#""""#);
1336        match lexer.next_token() {
1337            Some((Token::StringLit(s), _)) => assert_eq!(s, ""),
1338            other => panic!("Expected empty StringLit, got {:?}", other),
1339        }
1340    }
1341
1342    #[test]
1343    fn test_escape_sequence_helper() {
1344        // Unit test the helper function directly
1345        assert_eq!(process_escape_sequences(r"hello\nworld"), "hello\nworld");
1346        assert_eq!(process_escape_sequences(r"hello\tworld"), "hello\tworld");
1347        assert_eq!(process_escape_sequences(r"hello\\world"), "hello\\world");
1348        assert_eq!(process_escape_sequences(r#"hello\"world"#), "hello\"world");
1349        assert_eq!(process_escape_sequences(r"hello\x41world"), "helloAworld");
1350        assert_eq!(
1351            process_escape_sequences(r"hello\u{1F600}world"),
1352            "hello😀world"
1353        );
1354    }
1355}