sigil_parser/lexer.rs

//! Lexer for the Sigil programming language.
//!
//! Handles polysynthetic morphemes, evidentiality markers, and multi-base numerals.

use logos::Logos;
use crate::span::Span;

/// Process escape sequences in a string literal.
/// Converts \n, \t, \r, \\, \", \', \0, \xNN, \u{NNNN} to their actual characters.
fn process_escape_sequences(s: &str) -> String {
    let mut result = String::with_capacity(s.len());
    let mut chars = s.chars().peekable();

    while let Some(c) = chars.next() {
        if c == '\\' {
            match chars.next() {
                Some('n') => result.push('\n'),
                Some('t') => result.push('\t'),
                Some('r') => result.push('\r'),
                Some('\\') => result.push('\\'),
                Some('"') => result.push('"'),
                Some('\'') => result.push('\''),
                Some('0') => result.push('\0'),
                Some('x') => {
                    // \xNN - two hex digits; a short or malformed escape is
                    // dropped silently (from_str_radix fails on "" or "4")
                    let mut hex = String::new();
                    for _ in 0..2 {
                        match chars.peek() {
                            Some(&c) if c.is_ascii_hexdigit() => {
                                hex.push(chars.next().unwrap());
                            }
                            _ => break,
                        }
                    }
                    if let Ok(val) = u8::from_str_radix(&hex, 16) {
                        result.push(val as char);
                    }
                }
                Some('u') => {
                    // \u{NNNN} - Unicode code point; invalid or unclosed
                    // escapes are dropped silently
                    if chars.peek() == Some(&'{') {
                        chars.next(); // consume '{'
                        let mut hex = String::new();
                        while let Some(&c) = chars.peek() {
                            if c == '}' {
                                chars.next();
                                break;
                            }
                            if c.is_ascii_hexdigit() {
                                hex.push(chars.next().unwrap());
                            } else {
                                break;
                            }
                        }
                        if let Ok(val) = u32::from_str_radix(&hex, 16) {
                            if let Some(c) = char::from_u32(val) {
                                result.push(c);
                            }
                        }
                    }
                }
                Some(other) => {
                    // Unknown escape, keep as-is
                    result.push('\\');
                    result.push(other);
                }
                None => result.push('\\'),
            }
        } else {
            result.push(c);
        }
    }
    result
}

/// Callback for delimited raw strings (r#"..."#).
/// Reads until the closing "# is found.
fn raw_string_delimited_callback(lex: &mut logos::Lexer<'_, Token>) -> Option<String> {
    let remainder = lex.remainder();

    // Find the closing "#
    if let Some(end_pos) = remainder.find("\"#") {
        let content = &remainder[..end_pos];
        // Bump past content and closing "# (2 chars)
        lex.bump(end_pos + 2);
        Some(content.to_string())
    } else {
        // No closing "#: returning None makes logos emit an error token
        None
    }
}

/// Callback for multi-line string literals.
/// Reads from """ until the next """ is found.
fn multiline_string_callback(lex: &mut logos::Lexer<'_, Token>) -> Option<String> {
    let remainder = lex.remainder();

    // Find the closing """
    if let Some(end_pos) = remainder.find("\"\"\"") {
        let content = &remainder[..end_pos];
        // Bump the lexer past the content and closing quotes
        lex.bump(end_pos + 3);
        Some(process_escape_sequences(content))
    } else {
        // No closing """ found: returning None makes logos emit an error token
        None
    }
}

/// Process escape sequences in a character literal.
/// The \xNN and \u{...} branches are only reachable because the CharLit
/// regex below accepts multi-character escapes.
fn process_char_escape(s: &str) -> char {
    let mut chars = s.chars();
    match chars.next() {
        Some('\\') => {
            match chars.next() {
                Some('n') => '\n',
                Some('t') => '\t',
                Some('r') => '\r',
                Some('\\') => '\\',
                Some('"') => '"',
                Some('\'') => '\'',
                Some('0') => '\0',
                Some('x') => {
                    let hex: String = chars.take(2).collect();
                    u8::from_str_radix(&hex, 16).map(|v| v as char).unwrap_or('?')
                }
                Some('u') => {
                    if chars.next() == Some('{') {
                        let hex: String = chars.take_while(|&c| c != '}').collect();
                        u32::from_str_radix(&hex, 16)
                            .ok()
                            .and_then(char::from_u32)
                            .unwrap_or('?')
                    } else {
                        '?'
                    }
                }
                Some(c) => c,
                None => '?',
            }
        }
        Some(c) => c,
        None => '?',
    }
}

/// Token types for Sigil.
#[derive(Logos, Debug, Clone, PartialEq)]
#[logos(skip r"[ \t\r\n\f]+")]
pub enum Token {
    // === Comments ===
    #[regex(r"//[^\n]*", |lex| lex.slice().to_string())]
    LineComment(String),

    #[regex(r"//![^\n]*", |lex| lex.slice().to_string())]
    DocComment(String),

    // === Keywords ===
    #[token("fn")]
    Fn,
    #[token("async")]
    Async,
    #[token("let")]
    Let,
    #[token("mut")]
    Mut,
    #[token("const")]
    Const,
    #[token("type")]
    Type,
    #[token("struct")]
    Struct,
    #[token("enum")]
    Enum,
    #[token("trait")]
    Trait,
    #[token("impl")]
    Impl,
    #[token("mod")]
    Mod,
    #[token("use")]
    Use,
    #[token("pub")]
    Pub,
    #[token("actor")]
    Actor,
    #[token("saga")]
    Saga,
    #[token("scope")]
    Scope,
    #[token("rune")]
    Rune,

    // Control flow
    #[token("if")]
    If,
    #[token("else")]
    Else,
    #[token("match")]
    Match,
    #[token("loop")]
    Loop,
    #[token("while")]
    While,
    #[token("for")]
    For,
    #[token("in")]
    In,
    #[token("break")]
    Break,
    #[token("continue")]
    Continue,
    #[token("return")]
    Return,
    #[token("yield")]
    Yield,
    #[token("await")]
    Await,

    // Other keywords
    #[token("self")]
    SelfLower,
    #[token("Self")]
    SelfUpper,
    #[token("super")]
    Super,
    #[token("crate")]
    Crate,
    #[token("where")]
    Where,
    #[token("as")]
    As,
    #[token("dyn")]
    Dyn,
    #[token("move")]
    Move,
    #[token("ref")]
    Ref,
    #[token("static")]
    Static,
    #[token("unsafe")]
    Unsafe,
    #[token("extern")]
    Extern,
    #[token("asm")]
    Asm,
    #[token("volatile")]
    Volatile,
    #[token("naked")]
    Naked,
    #[token("packed")]
    Packed,
    #[token("simd")]
    Simd,
    #[token("atomic")]
    Atomic,
    #[token("derive")]
    Derive,
    #[token("on")]
    On,

    // Boolean literals
    #[token("true")]
    True,
    #[token("false")]
    False,

    // Null literal
    #[token("null")]
    Null,

    // === Morphemes (Greek letters) ===
    #[token("τ")]
    #[token("Τ")]
    Tau,  // Transform/map

    #[token("φ")]
    #[token("Φ")]
    Phi,  // Filter

    #[token("σ")]
    #[token("Σ")]
    Sigma,  // Sort (lowercase) / Sum (uppercase)

    #[token("ρ")]
    #[token("Ρ")]
    Rho,  // Reduce

    #[token("λ")]
    #[token("Λ")]
    Lambda,  // Lambda

    #[token("Π")]
    Pi,  // Product

    #[token("⌛")]
    Hourglass,  // Await symbol

    // Additional morphemes
    #[token("δ")]
    #[token("Δ")]
    Delta,  // Difference/change

    #[token("ε")]
    Epsilon,  // Empty/null

    #[token("ω")]
    #[token("Ω")]
    Omega,  // End/terminal

    #[token("α")]
    Alpha,  // First element

    #[token("ζ")]
    Zeta,  // Zip/combine

    // === Additional Access Morphemes ===
    #[token("μ")]
    #[token("Μ")]
    Mu,  // Middle/median element

    #[token("χ")]
    #[token("Χ")]
    Chi,  // Random/choice (from chaos)

    #[token("ν")]
    #[token("Ν")]
    Nu,  // Nth element (ordinal)

    #[token("ξ")]
    #[token("Ξ")]
    Xi,  // Next in sequence

    // === Parallel/Concurrency Morphemes ===
    #[token("∥")]
    #[token("parallel")]
    Parallel,  // Parallel execution (U+2225)

    #[token("⊛")]
    #[token("gpu")]
    Gpu,  // GPU compute shader (U+229B - circled asterisk)

    // === Quantifiers (for AI-native set operations) ===
    #[token("∀")]
    ForAll,  // Universal quantification

    #[token("∃")]
    Exists,  // Existential quantification

    #[token("∈")]
    ElementOf,  // Membership test

    #[token("∉")]
    NotElementOf,  // Non-membership

    // === Set Operations ===
    #[token("∪")]
    Union,  // Set union

    #[token("∩")]
    Intersection,  // Set intersection

    #[token("∖")]
    SetMinus,  // Set difference

    #[token("⊂")]
    Subset,  // Proper subset

    #[token("⊆")]
    SubsetEq,  // Subset or equal

    #[token("⊃")]
    Superset,  // Proper superset

    #[token("⊇")]
    SupersetEq,  // Superset or equal

    // === Logic Operators ===
    #[token("∧")]
    LogicAnd,  // Logical conjunction

    #[token("∨")]
    LogicOr,  // Logical disjunction

    #[token("¬")]
    LogicNot,  // Logical negation

    #[token("⊻")]
    LogicXor,  // Exclusive or

    #[token("⊤")]
    Top,  // True/any type

    #[token("⊥")]
    Bottom,  // False/never type

    // === Bitwise Operators (Unicode) ===
    #[token("⋏")]
    BitwiseAndSymbol,  // Bitwise AND (U+22CF)

    #[token("⋎")]
    BitwiseOrSymbol,  // Bitwise OR (U+22CE)

    // === Type Theory ===
    #[token("∷")]
    TypeAnnotation,  // Type annotation (alternative to :)

    // === Analysis/Calculus ===
    #[token("∫")]
    Integral,  // Cumulative sum

    #[token("∂")]
    Partial,  // Discrete derivative

    #[token("√")]
    Sqrt,  // Square root

    #[token("∛")]
    Cbrt,  // Cube root

    // === Category Theory ===
    #[token("∘")]
    Compose,  // Function composition

    #[token("⊗")]
    Tensor,  // Tensor product

    #[token("⊕")]
    DirectSum,  // Direct sum / XOR

    // === Data Operations ===
    #[token("⋈")]
    Bowtie,  // Join/zip combining (U+22C8)

    #[token("⋳")]
    ElementSmallVerticalBar,  // Flatten (U+22F3)

    #[token("⊔")]
    SquareCup,  // Lattice join / supremum (U+2294)

    #[token("⊓")]
    SquareCap,  // Lattice meet / infimum (U+2293)

    // === Evidentiality Markers ===
    // Note: These are handled contextually since ! and ? have other uses
    #[token("‽")]
    Interrobang,  // Paradox/trust boundary

    // === Affective Markers (Sentiment & Emotion) ===
    // Sentiment polarity
    #[token("⊖")]
    AffectNegative,  // Negative sentiment (U+2296 Circled Minus)

    #[token("⊜")]
    AffectNeutral,  // Neutral sentiment (U+229C Circled Equals)

    // Note: ⊕ (U+2295) is already DirectSum - we'll use it dual-purpose for positive sentiment

    // Sarcasm/Irony
    #[token("⸮")]
    IronyMark,  // Irony/sarcasm marker (U+2E2E - historical percontation point)

    // Intensity modifiers
    #[token("↑")]
    IntensityUp,  // Intensifier (U+2191)

    #[token("↓")]
    IntensityDown,  // Dampener (U+2193)

    #[token("⇈")]
    IntensityMax,  // Maximum intensity (U+21C8)

    // Formality register
    #[token("♔")]
    FormalRegister,  // Formal (U+2654 White King)

    #[token("♟")]
    InformalRegister,  // Informal (U+265F Black Pawn)

    // Emotion markers (Plutchik's wheel)
    #[token("☺")]
    EmotionJoy,  // Joy (U+263A)

    #[token("☹")]
    EmotionSadness,  // Sadness (U+2639)

    #[token("⚡")]
    EmotionAnger,  // Anger (U+26A1)

    #[token("❄")]
    EmotionFear,  // Fear (U+2744)

    #[token("✦")]
    EmotionSurprise,  // Surprise (U+2726)

    #[token("♡")]
    EmotionLove,  // Love/Trust (U+2661)

    // Confidence markers
    #[token("◉")]
    ConfidenceHigh,  // High confidence (U+25C9)

    #[token("◎")]
    ConfidenceMedium,  // Medium confidence (U+25CE)

    #[token("○")]
    ConfidenceLow,  // Low confidence (U+25CB)

    // === Aspect Morphemes (verb aspects) ===
    #[token("·ing")]
    AspectProgressive,  // Ongoing/streaming aspect

    #[token("·ed")]
    AspectPerfective,  // Completed aspect

    #[token("·able")]
    AspectPotential,  // Capability aspect

    #[token("·ive")]
    AspectResultative,  // Result-producing aspect

    // === Operators ===
    #[token("|")]
    Pipe,
    #[token("·")]
    MiddleDot,  // Incorporation
    #[token("->")]
    Arrow,
    #[token("=>")]
    FatArrow,
    #[token("<-")]
    LeftArrow,
    #[token("==")]
    EqEq,
    #[token("!=")]
    NotEq,
    #[token("<=")]
    LtEq,
    #[token(">=")]
    GtEq,
    #[token("<")]
    Lt,
    #[token(">")]
    Gt,
    #[token("+")]
    Plus,
    #[token("-")]
    Minus,
    #[token("*")]
    Star,
    #[token("/")]
    Slash,
    #[token("%")]
    Percent,
    #[token("**")]
    StarStar,  // Exponentiation
    #[token("&&")]
    AndAnd,
    #[token("||")]
    OrOr,
    #[token("!")]
    Bang,  // Evidentiality: known / logical not
    #[token("?")]
    Question,  // Evidentiality: uncertain / try
    #[token("~")]
    Tilde,  // Evidentiality: reported
    #[token("&")]
    Amp,
    #[token("^")]
    Caret,
    #[token("<<")]
    Shl,
    #[token(">>")]
    Shr,
    #[token("=")]
    Eq,
    #[token("+=")]
    PlusEq,
    #[token("-=")]
    MinusEq,
    #[token("*=")]
    StarEq,
    #[token("/=")]
    SlashEq,
    #[token("..")]
    DotDot,
    #[token("..=")]
    DotDotEq,
    #[token("++")]
    PlusPlus,  // Concatenation
    #[token("::")]
    ColonColon,
    #[token(":")]
    Colon,
    #[token(";")]
    Semi,
    #[token(",")]
    Comma,
    #[token(".")]
    Dot,
    #[token("@")]
    At,
    #[token("#!")]
    HashBang,  // Inner attribute prefix #![...]
    #[token("#")]
    Hash,
    #[token("_", priority = 3)]
    Underscore,

    // === Delimiters ===
    #[token("(")]
    LParen,
    #[token(")")]
    RParen,
    #[token("{")]
    LBrace,
    #[token("}")]
    RBrace,
    #[token("[")]
    LBracket,
    #[token("]")]
    RBracket,

    // === Special symbols ===
    #[token("∅")]
    Empty,  // Void/emptiness (śūnya)
    #[token("◯")]
    Circle,  // Geometric zero
    #[token("∞")]
    Infinity,  // Ananta

    // === Protocol Operations (Sigil-native networking) ===
    #[token("⇒")]
    ProtoSend,  // Send data (U+21D2 - rightwards double arrow)

    #[token("⇐")]
    ProtoRecv,  // Receive data (U+21D0 - leftwards double arrow)

    #[token("≋")]
    ProtoStream,  // Stream data (U+224B - triple tilde)

    #[token("⊸")]
    ProtoConnect,  // Connect/lollipop (U+22B8 - multimap)

    #[token("⏱")]
    ProtoTimeout,  // Timeout (U+23F1 - stopwatch)

    // Note: ⊗ (Tensor) is used for close in protocol contexts

    // Protocol keywords for ASCII fallback
    #[token("send")]
    Send,
    #[token("recv")]
    Recv,
    #[token("stream")]
    Stream,
    #[token("connect")]
    Connect,
    #[token("close")]
    Close,
    #[token("timeout")]
    Timeout,
    #[token("retry")]
    Retry,
    #[token("header")]
    Header,
    #[token("body")]
    Body,

    // Protocol type identifiers (for incorporation: http·, ws·, grpc·, kafka·)
    #[token("http")]
    Http,
    #[token("https")]
    Https,
    #[token("ws")]
    Ws,
    #[token("wss")]
    Wss,
    #[token("grpc")]
    Grpc,
    #[token("kafka")]
    Kafka,
    #[token("amqp")]
    Amqp,
    #[token("graphql")]
    GraphQL,

    // === Numbers ===
    // Binary: 0b...
    #[regex(r"0b[01_]+", |lex| lex.slice().to_string())]
    BinaryLit(String),

    // Octal: 0o...
    #[regex(r"0o[0-7_]+", |lex| lex.slice().to_string())]
    OctalLit(String),

    // Hex: 0x...
    #[regex(r"0x[0-9a-fA-F_]+", |lex| lex.slice().to_string())]
    HexLit(String),

    // Vigesimal: 0v... (base 20)
    #[regex(r"0v[0-9a-jA-J_]+", |lex| lex.slice().to_string())]
    VigesimalLit(String),

    // Sexagesimal: 0s... (base 60)
    #[regex(r"0s[0-9a-zA-Z_]+", |lex| lex.slice().to_string())]
    SexagesimalLit(String),

    // Duodecimal: 0z... (base 12)
    #[regex(r"0z[0-9a-bA-B_]+", |lex| lex.slice().to_string())]
    DuodecimalLit(String),

    // Float: 123.456 or 1.23e10
    #[regex(r"[0-9][0-9_]*\.[0-9][0-9_]*([eE][+-]?[0-9_]+)?", |lex| lex.slice().to_string())]
    FloatLit(String),

    // Integer: 123
    #[regex(r"[0-9][0-9_]*", |lex| lex.slice().to_string())]
    IntLit(String),

    // === Strings ===
    // Regular string with escape sequence processing
    #[regex(r#""([^"\\]|\\.)*""#, |lex| {
        let s = lex.slice();
        let inner = &s[1..s.len()-1];
        process_escape_sequences(inner)
    })]
    StringLit(String),

    // Multi-line string (triple-quoted) - handled via callback
    #[token(r#"""""#, multiline_string_callback)]
    MultiLineStringLit(String),

    // Byte string literal; escapes are NOT decoded here - the bytes,
    // backslashes included, are kept verbatim
    #[regex(r#"b"([^"\\]|\\.)*""#, |lex| {
        let s = lex.slice();
        let inner = &s[2..s.len()-1];
        inner.as_bytes().to_vec()
    })]
    ByteStringLit(Vec<u8>),

    // Interpolated string (will be parsed further for expressions)
    #[regex(r#"f"([^"\\]|\\.)*""#, |lex| {
        let s = lex.slice();
        let inner = &s[2..s.len()-1];
        process_escape_sequences(inner)
    })]
    InterpolatedStringLit(String),

    // Sigil string - SQL template (σ prefix)
    #[regex(r#"σ"([^"\\]|\\.)*""#, |lex| {
        let s = lex.slice();
        // Skip past the σ character (2 bytes in UTF-8) and the opening quote
        let start = "σ".len() + 1;
        let inner = &s[start..s.len()-1];
        process_escape_sequences(inner)
    })]
    SigilStringSql(String),

    // Sigil string - Route template (ρ prefix)
    #[regex(r#"ρ"([^"\\]|\\.)*""#, |lex| {
        let s = lex.slice();
        // Skip past the ρ character (2 bytes in UTF-8) and the opening quote
        let start = "ρ".len() + 1;
        let inner = &s[start..s.len()-1];
        process_escape_sequences(inner)
    })]
    SigilStringRoute(String),

    // Char literal with escape sequence processing. The extra alternatives
    // admit \xNN and \u{...} escapes (a bare \\. covers only single-character
    // escapes), keeping the regex in sync with process_char_escape
    #[regex(r"'([^'\\]|\\.|\\x[0-9a-fA-F][0-9a-fA-F]|\\u\{[0-9a-fA-F]+\})'", |lex| {
        let s = lex.slice();
        let inner = &s[1..s.len()-1];
        process_char_escape(inner)
    })]
    CharLit(char),

    // Raw string (no escape processing)
    #[regex(r#"r"[^"]*""#, |lex| {
        let s = lex.slice();
        s[2..s.len()-1].to_string()
    })]
    RawStringLit(String),

    // Raw string with delimiter (r#"..."# style) - handles internal quotes
    #[token(r##"r#""##, raw_string_delimited_callback)]
    RawStringDelimited(String),

    // === Identifiers ===
    #[regex(r"[a-zA-Z_][a-zA-Z0-9_]*", |lex| lex.slice().to_string())]
    Ident(String),

    // === Rune annotation ===
    #[regex(r"//@\s*rune:\s*[a-zA-Z_][a-zA-Z0-9_]*", |lex| lex.slice().to_string())]
    RuneAnnotation(String),
}

impl Token {
    /// True for any reserved word, including contextual, declaration-modifier,
    /// and FFI keywords; boolean and null literals are not counted.
    pub fn is_keyword(&self) -> bool {
        matches!(
            self,
            Token::Fn | Token::Async | Token::Let | Token::Mut | Token::Const |
            Token::Type | Token::Struct | Token::Enum | Token::Trait | Token::Impl |
            Token::Mod | Token::Use | Token::Pub | Token::Actor | Token::Saga |
            Token::Scope | Token::Rune | Token::If | Token::Else | Token::Match |
            Token::Loop | Token::While | Token::For | Token::In | Token::Break |
            Token::Continue | Token::Return | Token::Yield | Token::Await |
            Token::SelfLower | Token::SelfUpper | Token::Super | Token::Crate |
            Token::Where | Token::As | Token::Dyn | Token::Move | Token::Ref |
            Token::Static | Token::Unsafe | Token::Extern | Token::Asm |
            Token::Volatile | Token::Naked | Token::Packed | Token::Simd |
            Token::Atomic | Token::Derive | Token::On
        )
    }

    pub fn is_morpheme(&self) -> bool {
        matches!(
            self,
            Token::Tau | Token::Phi | Token::Sigma | Token::Rho |
            Token::Lambda | Token::Pi | Token::Hourglass |
            Token::Delta | Token::Epsilon | Token::Omega | Token::Alpha | Token::Zeta |
            Token::Mu | Token::Chi | Token::Nu | Token::Xi |  // Access morphemes
            Token::Parallel | Token::Gpu |  // Concurrency morphemes
            Token::Integral | Token::Partial | Token::Sqrt | Token::Cbrt |
            Token::Compose
        )
    }

    pub fn is_aspect(&self) -> bool {
        matches!(
            self,
            Token::AspectProgressive | Token::AspectPerfective |
            Token::AspectPotential | Token::AspectResultative
        )
    }

    pub fn is_data_op(&self) -> bool {
        matches!(
            self,
            Token::Bowtie | Token::ElementSmallVerticalBar |
            Token::SquareCup | Token::SquareCap
        )
    }

    pub fn is_bitwise_symbol(&self) -> bool {
        matches!(
            self,
            Token::BitwiseAndSymbol | Token::BitwiseOrSymbol
        )
    }

    pub fn is_quantifier(&self) -> bool {
        matches!(
            self,
            Token::ForAll | Token::Exists | Token::ElementOf | Token::NotElementOf
        )
    }

    pub fn is_set_op(&self) -> bool {
        matches!(
            self,
            Token::Union | Token::Intersection | Token::SetMinus |
            Token::Subset | Token::SubsetEq | Token::Superset | Token::SupersetEq
        )
    }

    pub fn is_logic_op(&self) -> bool {
        matches!(
            self,
            Token::LogicAnd | Token::LogicOr | Token::LogicNot | Token::LogicXor |
            Token::Top | Token::Bottom
        )
    }

    pub fn is_evidentiality(&self) -> bool {
        matches!(
            self,
            Token::Bang | Token::Question | Token::Tilde | Token::Interrobang
        )
    }

    pub fn is_affective(&self) -> bool {
        matches!(
            self,
            // Sentiment
            Token::DirectSum |  // ⊕ positive (dual-purpose with DirectSum)
            Token::AffectNegative |  // ⊖ negative
            Token::AffectNeutral |  // ⊜ neutral
            // Sarcasm
            Token::IronyMark |  // ⸮ irony/sarcasm
            // Intensity
            Token::IntensityUp |  // ↑
            Token::IntensityDown |  // ↓
            Token::IntensityMax |  // ⇈
            // Formality
            Token::FormalRegister |  // ♔
            Token::InformalRegister |  // ♟
            // Emotions
            Token::EmotionJoy |  // ☺
            Token::EmotionSadness |  // ☹
            Token::EmotionAnger |  // ⚡
            Token::EmotionFear |  // ❄
            Token::EmotionSurprise |  // ✦
            Token::EmotionLove |  // ♡
            // Confidence
            Token::ConfidenceHigh |  // ◉
            Token::ConfidenceMedium |  // ◎
            Token::ConfidenceLow  // ○
        )
    }

    pub fn is_sentiment(&self) -> bool {
        matches!(
            self,
            Token::DirectSum | Token::AffectNegative | Token::AffectNeutral
        )
    }

    pub fn is_emotion(&self) -> bool {
        matches!(
            self,
            Token::EmotionJoy | Token::EmotionSadness | Token::EmotionAnger |
            Token::EmotionFear | Token::EmotionSurprise | Token::EmotionLove
        )
    }

    pub fn is_intensity(&self) -> bool {
        matches!(
            self,
            Token::IntensityUp | Token::IntensityDown | Token::IntensityMax
        )
    }
}

/// Lexer wrapping Logos for Sigil.
pub struct Lexer<'a> {
    inner: logos::Lexer<'a, Token>,
    peeked: Option<Option<(Token, Span)>>,
}

impl<'a> Lexer<'a> {
    pub fn new(source: &'a str) -> Self {
        Self {
            inner: Token::lexer(source),
            peeked: None,
        }
    }

    pub fn next_token(&mut self) -> Option<(Token, Span)> {
        if let Some(peeked) = self.peeked.take() {
            return peeked;
        }

        loop {
            match self.inner.next() {
                Some(Ok(token)) => {
                    let span = self.inner.span();
                    return Some((token, Span::new(span.start, span.end)));
                }
                // Skip invalid tokens; looping (rather than recursing) keeps
                // long runs of invalid input from overflowing the stack
                Some(Err(_)) => continue,
                None => return None,
            }
        }
    }

    pub fn peek(&mut self) -> Option<&(Token, Span)> {
        if self.peeked.is_none() {
            self.peeked = Some(self.next_token());
        }
        self.peeked.as_ref().and_then(|p| p.as_ref())
    }

    pub fn span(&self) -> Span {
        let span = self.inner.span();
        Span::new(span.start, span.end)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_morphemes() {
        let mut lexer = Lexer::new("τ φ σ ρ λ Σ Π ⌛");
        assert!(matches!(lexer.next_token(), Some((Token::Tau, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Phi, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Sigma, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Rho, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Lambda, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Sigma, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Pi, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Hourglass, _))));
    }
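
    // Hedged sketch: this assumes logos breaks the //! vs // (and //@ rune:)
    // overlaps toward the more specific patterns, since a longer literal
    // prefix gets a higher pattern priority.
    #[test]
    fn test_comments_and_rune_annotation() {
        let mut lexer = Lexer::new("//! doc\n// plain\n//@ rune: transform");
        assert!(matches!(lexer.next_token(), Some((Token::DocComment(s), _)) if s == "//! doc"));
        assert!(matches!(lexer.next_token(), Some((Token::LineComment(s), _)) if s == "// plain"));
        assert!(matches!(lexer.next_token(), Some((Token::RuneAnnotation(s), _)) if s == "//@ rune: transform"));
    }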

    #[test]
    fn test_evidentiality() {
        let mut lexer = Lexer::new("value! uncertain? reported~ paradox‽");
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "value"));
        assert!(matches!(lexer.next_token(), Some((Token::Bang, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "uncertain"));
        assert!(matches!(lexer.next_token(), Some((Token::Question, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "reported"));
        assert!(matches!(lexer.next_token(), Some((Token::Tilde, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "paradox"));
        assert!(matches!(lexer.next_token(), Some((Token::Interrobang, _))));
    }
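
    // #! must lex as the single HashBang token (inner attributes), never as
    // Hash followed by Bang; longest-match makes this unambiguous.
    #[test]
    fn test_hash_bang() {
        let mut lexer = Lexer::new("#![attr] #[attr]");
        assert!(matches!(lexer.next_token(), Some((Token::HashBang, _))));
        assert!(matches!(lexer.next_token(), Some((Token::LBracket, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "attr"));
        assert!(matches!(lexer.next_token(), Some((Token::RBracket, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Hash, _))));
    }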

    #[test]
    fn test_pipe_chain() {
        let mut lexer = Lexer::new("data|τ{f}|φ{p}|σ");
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "data"));
        assert!(matches!(lexer.next_token(), Some((Token::Pipe, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Tau, _))));
        assert!(matches!(lexer.next_token(), Some((Token::LBrace, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "f"));
        assert!(matches!(lexer.next_token(), Some((Token::RBrace, _))));
    }

    #[test]
    fn test_numbers() {
        let mut lexer = Lexer::new("42 0b1010 0o52 0x2A 0v22 0s42 3.14");
        assert!(matches!(lexer.next_token(), Some((Token::IntLit(s), _)) if s == "42"));
        assert!(matches!(lexer.next_token(), Some((Token::BinaryLit(s), _)) if s == "0b1010"));
        assert!(matches!(lexer.next_token(), Some((Token::OctalLit(s), _)) if s == "0o52"));
        assert!(matches!(lexer.next_token(), Some((Token::HexLit(s), _)) if s == "0x2A"));
        assert!(matches!(lexer.next_token(), Some((Token::VigesimalLit(s), _)) if s == "0v22"));
        assert!(matches!(lexer.next_token(), Some((Token::SexagesimalLit(s), _)) if s == "0s42"));
        assert!(matches!(lexer.next_token(), Some((Token::FloatLit(s), _)) if s == "3.14"));
    }
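
    // The duodecimal (0z) base and exponent floats aren't exercised above;
    // a small sketch of both.
    #[test]
    fn test_duodecimal_and_float_exponent() {
        let mut lexer = Lexer::new("0z3b 1.5e10");
        assert!(matches!(lexer.next_token(), Some((Token::DuodecimalLit(s), _)) if s == "0z3b"));
        assert!(matches!(lexer.next_token(), Some((Token::FloatLit(s), _)) if s == "1.5e10"));
    }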

    #[test]
    fn test_incorporation() {
        let mut lexer = Lexer::new("file·open·read");
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "file"));
        assert!(matches!(lexer.next_token(), Some((Token::MiddleDot, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "open"));
        assert!(matches!(lexer.next_token(), Some((Token::MiddleDot, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "read"));
    }
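
    // `_` carries priority 3 so a lone underscore lexes as Underscore, while
    // `_foo` still wins as a single identifier via longest match.
    #[test]
    fn test_underscore_vs_ident() {
        let mut lexer = Lexer::new("_ _foo");
        assert!(matches!(lexer.next_token(), Some((Token::Underscore, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "_foo"));
    }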

    #[test]
    fn test_special_symbols() {
        let mut lexer = Lexer::new("∅ ◯ ∞");
        assert!(matches!(lexer.next_token(), Some((Token::Empty, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Circle, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Infinity, _))));
    }

    #[test]
    fn test_quantifiers() {
        let mut lexer = Lexer::new("∀x ∃y x∈S y∉T");
        assert!(matches!(lexer.next_token(), Some((Token::ForAll, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "x"));
        assert!(matches!(lexer.next_token(), Some((Token::Exists, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "y"));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "x"));
        assert!(matches!(lexer.next_token(), Some((Token::ElementOf, _))));
    }

    #[test]
    fn test_set_operations() {
        let mut lexer = Lexer::new("A∪B A∩B A∖B A⊂B A⊆B");
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "A"));
        assert!(matches!(lexer.next_token(), Some((Token::Union, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "B"));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "A"));
        assert!(matches!(lexer.next_token(), Some((Token::Intersection, _))));
    }

    #[test]
    fn test_logic_operators() {
        let mut lexer = Lexer::new("p∧q p∨q ¬p p⊻q ⊤ ⊥");
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "p"));
        assert!(matches!(lexer.next_token(), Some((Token::LogicAnd, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "q"));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "p"));
        assert!(matches!(lexer.next_token(), Some((Token::LogicOr, _))));
    }

    #[test]
    fn test_analysis_operators() {
        let mut lexer = Lexer::new("∫f ∂g √x ∛y f∘g");
        assert!(matches!(lexer.next_token(), Some((Token::Integral, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "f"));
        assert!(matches!(lexer.next_token(), Some((Token::Partial, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "g"));
        assert!(matches!(lexer.next_token(), Some((Token::Sqrt, _))));
    }

    #[test]
    fn test_additional_morphemes() {
        let mut lexer = Lexer::new("δ ε ω α ζ");
        assert!(matches!(lexer.next_token(), Some((Token::Delta, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Epsilon, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Omega, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Alpha, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Zeta, _))));
    }

    #[test]
    fn test_ffi_keywords() {
        let mut lexer = Lexer::new("extern unsafe");
        assert!(matches!(lexer.next_token(), Some((Token::Extern, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Unsafe, _))));
    }
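
    // Sketch covering the protocol morphemes and their ASCII fallback keywords.
    #[test]
    fn test_protocol_tokens() {
        let mut lexer = Lexer::new("⊸ send recv ⇒ ⇐ ≋");
        assert!(matches!(lexer.next_token(), Some((Token::ProtoConnect, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Send, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Recv, _))));
        assert!(matches!(lexer.next_token(), Some((Token::ProtoSend, _))));
        assert!(matches!(lexer.next_token(), Some((Token::ProtoRecv, _))));
        assert!(matches!(lexer.next_token(), Some((Token::ProtoStream, _))));
    }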

    #[test]
    fn test_parallel_morphemes() {
        let mut lexer = Lexer::new("∥ parallel ⊛ gpu");
        assert!(matches!(lexer.next_token(), Some((Token::Parallel, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Parallel, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Gpu, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Gpu, _))));
    }
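
    // Maximal munch should prefer the longer `·ing`/`·ed` aspect tokens over
    // a bare MiddleDot followed by an identifier.
    #[test]
    fn test_aspect_morphemes() {
        let mut lexer = Lexer::new("stream·ing save·ed");
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "stream"));
        assert!(matches!(lexer.next_token(), Some((Token::AspectProgressive, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "save"));
        assert!(matches!(lexer.next_token(), Some((Token::AspectPerfective, _))));
    }

    // Affective markers; note ⊕ deliberately lexes as DirectSum and is
    // reinterpreted as positive sentiment downstream (see is_sentiment).
    #[test]
    fn test_affective_markers() {
        let mut lexer = Lexer::new("⊕ ⊖ ⊜ ⸮ ↑ ⇈ ◉");
        assert!(matches!(lexer.next_token(), Some((Token::DirectSum, _))));
        assert!(matches!(lexer.next_token(), Some((Token::AffectNegative, _))));
        assert!(matches!(lexer.next_token(), Some((Token::AffectNeutral, _))));
        assert!(matches!(lexer.next_token(), Some((Token::IronyMark, _))));
        assert!(matches!(lexer.next_token(), Some((Token::IntensityUp, _))));
        assert!(matches!(lexer.next_token(), Some((Token::IntensityMax, _))));
        assert!(matches!(lexer.next_token(), Some((Token::ConfidenceHigh, _))));
    }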

    // ==================== STRING LITERAL TESTS ====================

    #[test]
    fn test_string_escape_sequences() {
        // Test basic escape sequences
        let mut lexer = Lexer::new(r#""hello\nworld""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "hello\nworld"),
            other => panic!("Expected StringLit, got {:?}", other),
        }

        // Test tab escape
        let mut lexer = Lexer::new(r#""hello\tworld""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "hello\tworld"),
            other => panic!("Expected StringLit, got {:?}", other),
        }

        // Test carriage return
        let mut lexer = Lexer::new(r#""hello\rworld""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "hello\rworld"),
            other => panic!("Expected StringLit, got {:?}", other),
        }

        // Test escaped backslash
        let mut lexer = Lexer::new(r#""hello\\world""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "hello\\world"),
            other => panic!("Expected StringLit, got {:?}", other),
        }

        // Test escaped quote
        let mut lexer = Lexer::new(r#""hello\"world""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "hello\"world"),
            other => panic!("Expected StringLit, got {:?}", other),
        }

        // Test null character
        let mut lexer = Lexer::new(r#""hello\0world""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "hello\0world"),
            other => panic!("Expected StringLit, got {:?}", other),
        }
    }

    #[test]
    fn test_string_hex_escape() {
        // Test \xNN hex escape
        let mut lexer = Lexer::new(r#""hello\x41world""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "helloAworld"),
            other => panic!("Expected StringLit, got {:?}", other),
        }
    }

    #[test]
    fn test_string_unicode_escape() {
        // Test \u{NNNN} Unicode escape
        let mut lexer = Lexer::new(r#""hello\u{1F600}world""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "hello😀world"),
            other => panic!("Expected StringLit, got {:?}", other),
        }

        // Test Greek letter
        let mut lexer = Lexer::new(r#""\u{03C4}""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "τ"),
            other => panic!("Expected StringLit, got {:?}", other),
        }
    }
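
    // Multi-line strings: the `"""` token's callback scans to the next `"""`
    // and escape-processes the body (see multiline_string_callback).
    #[test]
    fn test_multiline_string() {
        let mut lexer = Lexer::new("\"\"\"line one\nline two\"\"\"");
        match lexer.next_token() {
            Some((Token::MultiLineStringLit(s), _)) => assert_eq!(s, "line one\nline two"),
            other => panic!("Expected MultiLineStringLit, got {:?}", other),
        }
    }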

    #[test]
    fn test_char_escape_sequences() {
        let mut lexer = Lexer::new(r"'\n'");
        match lexer.next_token() {
            Some((Token::CharLit(c), _)) => assert_eq!(c, '\n'),
            other => panic!("Expected CharLit, got {:?}", other),
        }

        let mut lexer = Lexer::new(r"'\t'");
        match lexer.next_token() {
            Some((Token::CharLit(c), _)) => assert_eq!(c, '\t'),
            other => panic!("Expected CharLit, got {:?}", other),
        }

        let mut lexer = Lexer::new(r"'\\'");
        match lexer.next_token() {
            Some((Token::CharLit(c), _)) => assert_eq!(c, '\\'),
            other => panic!("Expected CharLit, got {:?}", other),
        }
    }
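
    // Assumes the extended CharLit regex above, which admits \xNN and \u{...}
    // escapes so process_char_escape's hex branches are reachable.
    #[test]
    fn test_char_hex_and_unicode_escape() {
        let mut lexer = Lexer::new(r"'\x41'");
        match lexer.next_token() {
            Some((Token::CharLit(c), _)) => assert_eq!(c, 'A'),
            other => panic!("Expected CharLit, got {:?}", other),
        }

        let mut lexer = Lexer::new(r"'\u{03C4}'");
        match lexer.next_token() {
            Some((Token::CharLit(c), _)) => assert_eq!(c, 'τ'),
            other => panic!("Expected CharLit, got {:?}", other),
        }
    }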

    #[test]
    fn test_raw_string() {
        // Raw strings should NOT process escapes
        let mut lexer = Lexer::new(r#"r"hello\nworld""#);
        match lexer.next_token() {
            Some((Token::RawStringLit(s), _)) => assert_eq!(s, r"hello\nworld"),
            other => panic!("Expected RawStringLit, got {:?}", other),
        }
    }

    #[test]
    fn test_raw_string_delimited() {
        // r#"..."# style
        let mut lexer = Lexer::new(r##"r#"hello "world""#"##);
        match lexer.next_token() {
            Some((Token::RawStringDelimited(s), _)) => assert_eq!(s, r#"hello "world""#),
            other => panic!("Expected RawStringDelimited, got {:?}", other),
        }
    }

    #[test]
    fn test_byte_string() {
        let mut lexer = Lexer::new(r#"b"hello""#);
        match lexer.next_token() {
            Some((Token::ByteStringLit(bytes), _)) => {
                assert_eq!(bytes, vec![104, 101, 108, 108, 111]); // "hello" in ASCII
            }
            other => panic!("Expected ByteStringLit, got {:?}", other),
        }
    }

    #[test]
    fn test_interpolated_string() {
        let mut lexer = Lexer::new(r#"f"hello {name}""#);
        match lexer.next_token() {
            Some((Token::InterpolatedStringLit(s), _)) => assert_eq!(s, "hello {name}"),
            other => panic!("Expected InterpolatedStringLit, got {:?}", other),
        }
    }

    #[test]
    fn test_sigil_string_sql() {
        let mut lexer = Lexer::new(r#"σ"SELECT * FROM {table}""#);
        match lexer.next_token() {
            Some((Token::SigilStringSql(s), _)) => assert_eq!(s, "SELECT * FROM {table}"),
            other => panic!("Expected SigilStringSql, got {:?}", other),
        }
    }

    #[test]
    fn test_sigil_string_route() {
        let mut lexer = Lexer::new(r#"ρ"/api/v1/{resource}/{id}""#);
        match lexer.next_token() {
            Some((Token::SigilStringRoute(s), _)) => assert_eq!(s, "/api/v1/{resource}/{id}"),
            other => panic!("Expected SigilStringRoute, got {:?}", other),
        }
    }

    #[test]
    fn test_unicode_in_strings() {
        // Test direct Unicode in strings
        let mut lexer = Lexer::new(r#""τφσρ 你好 🦀""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "τφσρ 你好 🦀"),
            other => panic!("Expected StringLit, got {:?}", other),
        }
    }

    #[test]
    fn test_empty_string() {
        let mut lexer = Lexer::new(r#""""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, ""),
            other => panic!("Expected empty StringLit, got {:?}", other),
        }
    }

    #[test]
    fn test_escape_sequence_helper() {
        // Unit test the helper function directly
        assert_eq!(process_escape_sequences(r"hello\nworld"), "hello\nworld");
        assert_eq!(process_escape_sequences(r"hello\tworld"), "hello\tworld");
        assert_eq!(process_escape_sequences(r"hello\\world"), "hello\\world");
        assert_eq!(process_escape_sequences(r#"hello\"world"#), "hello\"world");
        assert_eq!(process_escape_sequences(r"hello\x41world"), "helloAworld");
        assert_eq!(process_escape_sequences(r"hello\u{1F600}world"), "hello😀world");
    }
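
    // Peeking must not consume: repeated peeks return the same token.
    #[test]
    fn test_peek_does_not_consume() {
        let mut lexer = Lexer::new("fn main");
        assert!(matches!(lexer.peek(), Some((Token::Fn, _))));
        assert!(matches!(lexer.peek(), Some((Token::Fn, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Fn, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "main"));
        assert!(lexer.next_token().is_none());
    }

    // Sanity checks over the Token predicate helpers.
    #[test]
    fn test_token_predicates() {
        assert!(Token::Fn.is_keyword());
        assert!(Token::Tau.is_morpheme());
        assert!(Token::AspectProgressive.is_aspect());
        assert!(Token::Bang.is_evidentiality());
        assert!(Token::Union.is_set_op());
        assert!(Token::ForAll.is_quantifier());
        assert!(Token::Bowtie.is_data_op());
        assert!(!Token::Plus.is_keyword());
    }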
}