1use crate::span::Span;
6use logos::Logos;
7
/// Decodes backslash escape sequences in a string-literal body.
///
/// Supported escapes: `\n`, `\t`, `\r`, `\\`, `\"`, `\'`, `\0`,
/// `\xNN` (up to two hex digits), and `\u{...}` (hex digits up to `}`).
/// A backslash followed by a newline is a line continuation: the newline
/// and any following spaces/tabs are swallowed. An unrecognized escape is
/// kept verbatim (backslash plus the character); a malformed `\x`/`\u`
/// payload is silently dropped, matching the lexer's lenient policy.
fn process_escape_sequences(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    let mut iter = s.chars().peekable();

    while let Some(ch) = iter.next() {
        if ch != '\\' {
            out.push(ch);
            continue;
        }
        match iter.next() {
            Some('\n') => {
                // Line continuation: swallow the indentation that follows.
                while matches!(iter.peek(), Some(&' ') | Some(&'\t')) {
                    iter.next();
                }
            }
            Some('n') => out.push('\n'),
            Some('t') => out.push('\t'),
            Some('r') => out.push('\r'),
            Some('\\') => out.push('\\'),
            Some('"') => out.push('"'),
            Some('\'') => out.push('\''),
            Some('0') => out.push('\0'),
            Some('x') => {
                // Consume up to two hex digits; stop at the first non-digit.
                let mut digits = String::new();
                while digits.len() < 2 {
                    match iter.peek() {
                        Some(&d) if d.is_ascii_hexdigit() => {
                            digits.push(d);
                            iter.next();
                        }
                        _ => break,
                    }
                }
                // 0x80..=0xFF map to U+0080..U+00FF (Latin-1 style).
                if let Ok(byte) = u8::from_str_radix(&digits, 16) {
                    out.push(byte as char);
                }
            }
            Some('u') => {
                // Only the braced form `\u{...}` is recognized; a bare `\u`
                // is dropped.
                if iter.peek() == Some(&'{') {
                    iter.next();
                    let mut digits = String::new();
                    loop {
                        match iter.peek() {
                            Some(&'}') => {
                                iter.next();
                                break;
                            }
                            Some(&d) if d.is_ascii_hexdigit() => {
                                digits.push(d);
                                iter.next();
                            }
                            _ => break,
                        }
                    }
                    let decoded = u32::from_str_radix(&digits, 16)
                        .ok()
                        .and_then(char::from_u32);
                    if let Some(c) = decoded {
                        out.push(c);
                    }
                }
            }
            // Unknown escape: preserve it verbatim.
            Some(other) => {
                out.push('\\');
                out.push(other);
            }
            // Trailing lone backslash.
            None => out.push('\\'),
        }
    }
    out
}
85
/// Decodes backslash escapes in a byte-string-literal body into raw bytes.
///
/// Mirrors `process_escape_sequences` but without `\u{...}` (byte strings
/// hold no Unicode escapes). Non-ASCII characters are dropped outright,
/// both as ordinary content and after an unknown escape; `\xNN` yields the
/// byte value directly.
fn process_byte_escape_sequences(s: &str) -> Vec<u8> {
    let mut out = Vec::with_capacity(s.len());
    let mut iter = s.chars().peekable();

    while let Some(ch) = iter.next() {
        if ch != '\\' {
            // Only ASCII content survives in a byte string.
            if ch.is_ascii() {
                out.push(ch as u8);
            }
            continue;
        }
        match iter.next() {
            Some('\n') => {
                // Line continuation: swallow trailing indentation.
                while matches!(iter.peek(), Some(&' ') | Some(&'\t')) {
                    iter.next();
                }
            }
            Some('n') => out.push(b'\n'),
            Some('t') => out.push(b'\t'),
            Some('r') => out.push(b'\r'),
            Some('\\') => out.push(b'\\'),
            Some('"') => out.push(b'"'),
            Some('\'') => out.push(b'\''),
            Some('0') => out.push(0),
            Some('x') => {
                // Up to two hex digits; stop at the first non-digit.
                let mut digits = String::new();
                while digits.len() < 2 {
                    match iter.peek() {
                        Some(&d) if d.is_ascii_hexdigit() => {
                            digits.push(d);
                            iter.next();
                        }
                        _ => break,
                    }
                }
                if let Ok(byte) = u8::from_str_radix(&digits, 16) {
                    out.push(byte);
                }
            }
            // Unknown escape: keep the backslash; keep the char only if ASCII.
            Some(other) => {
                out.push(b'\\');
                if other.is_ascii() {
                    out.push(other as u8);
                }
            }
            None => out.push(b'\\'),
        }
    }
    out
}
141
142fn block_comment_callback(lex: &mut logos::Lexer<'_, Token>) -> Option<String> {
147 let remainder = lex.remainder();
148
149 if let Some(end_pos) = remainder.find("*/") {
151 let content = &remainder[..end_pos];
152 lex.bump(end_pos + 2);
154 Some(content.to_string())
155 } else {
156 let len = remainder.len();
158 lex.bump(len);
159 Some(remainder.to_string())
160 }
161}
162
163fn raw_string_delimited_callback(lex: &mut logos::Lexer<'_, Token>) -> Option<String> {
164 let remainder = lex.remainder();
165
166 if let Some(end_pos) = remainder.find("\"#") {
168 let content = &remainder[..end_pos];
169 lex.bump(end_pos + 2);
171 Some(content.to_string())
172 } else {
173 None
174 }
175}
176
177fn multiline_string_callback(lex: &mut logos::Lexer<'_, Token>) -> Option<String> {
180 let remainder = lex.remainder();
181
182 if let Some(end_pos) = remainder.find("\"\"\"") {
184 let content = &remainder[..end_pos];
185 lex.bump(end_pos + 3);
187 Some(process_escape_sequences(content))
188 } else {
189 None
191 }
192}
193
/// Decodes the body of a character literal into a single `char`.
///
/// A non-escaped body yields its first character. Escapes mirror the
/// string rules (`\n`, `\t`, `\r`, `\\`, `\"`, `\'`, `\0`, `\xNN`,
/// `\u{...}`). Malformed input degrades to `'?'` rather than failing,
/// since the literal already matched the lexer's regex.
fn process_char_escape(s: &str) -> char {
    const FALLBACK: char = '?';
    let mut rest = s.chars();

    let first = match rest.next() {
        Some(c) => c,
        None => return FALLBACK,
    };
    if first != '\\' {
        return first;
    }

    match rest.next() {
        Some('n') => '\n',
        Some('t') => '\t',
        Some('r') => '\r',
        Some('\\') => '\\',
        Some('"') => '"',
        Some('\'') => '\'',
        Some('0') => '\0',
        Some('x') => {
            // Two hex digits; values >= 0x80 map Latin-1 style.
            let digits: String = rest.take(2).collect();
            u8::from_str_radix(&digits, 16)
                .map(|b| b as char)
                .unwrap_or(FALLBACK)
        }
        Some('u') => {
            // Only the braced form `\u{...}` is recognized.
            if rest.next() == Some('{') {
                let digits: String = rest.take_while(|&c| c != '}').collect();
                u32::from_str_radix(&digits, 16)
                    .ok()
                    .and_then(char::from_u32)
                    .unwrap_or(FALLBACK)
            } else {
                FALLBACK
            }
        }
        // Unknown escape: the escaped character itself.
        Some(c) => c,
        None => FALLBACK,
    }
}
230
/// Decodes the body of a byte-character literal (`b'..'`) into a `u8`.
///
/// Mirrors `process_char_escape` minus `\u{...}` (bytes carry no Unicode
/// escapes). Malformed input degrades to `b'?'`. Note: a non-ASCII
/// character is truncated by the `as u8` cast, matching the original
/// lenient behavior.
fn process_byte_char_escape(s: &str) -> u8 {
    const FALLBACK: u8 = b'?';
    let mut rest = s.chars();

    let first = match rest.next() {
        Some(c) => c,
        None => return FALLBACK,
    };
    if first != '\\' {
        return first as u8;
    }

    match rest.next() {
        Some('n') => b'\n',
        Some('t') => b'\t',
        Some('r') => b'\r',
        Some('\\') => b'\\',
        Some('"') => b'"',
        Some('\'') => b'\'',
        Some('0') => b'\0',
        Some('x') => {
            let digits: String = rest.take(2).collect();
            u8::from_str_radix(&digits, 16).unwrap_or(FALLBACK)
        }
        // Unknown escape: the escaped character itself (truncated).
        Some(c) => c as u8,
        None => FALLBACK,
    }
}
254
255#[derive(Logos, Debug, Clone, PartialEq)]
257#[logos(skip r"[ \t\r\n\f]+")]
258pub enum Token {
259 #[regex(r"//[^\n]*", |lex| lex.slice().to_string())]
261 LineComment(String),
262
263 #[regex(r"//![^\n]*", |lex| lex.slice().to_string())]
264 DocComment(String),
265
266 #[regex(r"~~[^\n]*", |lex| lex.slice().to_string())]
268 TildeComment(String),
269
270 #[token("/*", block_comment_callback)]
272 BlockComment(String),
273
274 #[token("fn")]
276 Fn,
277 #[token("async")]
278 Async,
279 #[token("let")]
280 Let,
281 #[token("mut")]
282 Mut,
283 #[token("const")]
284 Const,
285 #[token("type")]
286 Type,
287 #[token("struct")]
288 #[token("sigil")] Struct,
290 #[token("enum")]
291 Enum,
292 #[token("trait")]
293 Trait,
294 #[token("impl")]
295 Impl,
296 #[token("mod")]
297 #[token("scroll")] Mod,
299 #[token("use")]
300 #[token("invoke")] Use,
302 #[token("pub")]
303 Pub,
304 #[token("actor")]
305 Actor,
306 #[token("saga")]
307 Saga,
308 #[token("scope")]
309 Scope,
310 #[token("rune")]
311 Rune,
312 #[token("macro")]
313 Macro,
314 #[token("macro_rules")]
315 MacroRules,
316
317 #[token("if")]
319 If,
320 #[token("else")]
321 Else,
322 #[token("match")]
323 Match,
324 #[token("loop")]
325 Loop,
326 #[token("while")]
327 While,
328 #[token("for")]
329 For,
330 #[token("in")]
331 In,
332 #[token("break")]
333 Break,
334 #[token("continue")]
335 Continue,
336 #[token("return")]
337 Return,
338 #[token("yield")]
339 Yield,
340 #[token("await")]
341 Await,
342
343 #[token("self")]
345 SelfLower,
346 #[token("Self")]
347 SelfUpper,
348 #[token("super")]
349 Super,
350 #[token("crate")]
351 #[token("tome")] Crate,
353 #[token("where")]
354 Where,
355 #[token("as")]
356 As,
357 #[token("dyn")]
358 Dyn,
359 #[token("move")]
360 Move,
361 #[token("ref")]
362 Ref,
363 #[token("static")]
364 Static,
365 #[token("unsafe")]
366 Unsafe,
367 #[token("extern")]
368 Extern,
369 #[token("asm")]
370 Asm,
371 #[token("volatile")]
372 Volatile,
373 #[token("naked")]
374 Naked,
375 #[token("packed")]
376 Packed,
377 #[token("simd")]
378 Simd,
379 #[token("atomic")]
380 Atomic,
381 #[token("derive")]
382 Derive,
383 #[token("on")]
384 On,
385
386 #[token("alter")]
388 Alter,
389 #[token("switch")]
390 Switch,
391 #[token("headspace")]
392 Headspace,
393 #[token("cocon")]
394 CoCon,
395 #[token("reality")]
396 Reality,
397 #[token("split")]
398 Split,
399 #[token("trigger")]
400 Trigger,
401 #[token("layer")]
402 Layer,
403 #[token("location")]
404 Location,
405 #[token("states")]
406 States,
407 #[token("anima")]
408 Anima,
409 #[token("to")]
410 To,
411 #[token("from")]
412 From,
413
414 #[token("@!")]
416 AlterSourceFronting,
417 #[token("@~")]
418 AlterSourceCoCon,
419 #[token("@?")]
420 AlterSourceDormant,
421 #[token("@‽")]
422 AlterSourceBlended,
423
424 #[token("true")]
426 True,
427 #[token("false")]
428 False,
429
430 #[token("null")]
432 Null,
433
434 #[token("τ")]
436 #[token("Τ")]
437 Tau, #[token("φ")]
440 #[token("Φ")]
441 Phi, #[token("σ")]
444 #[token("Σ")]
445 Sigma, #[token("ρ")]
448 #[token("Ρ")]
449 Rho, #[token("λ")]
452 #[token("Λ")]
453 Lambda, #[token("Π")]
456 Pi, #[token("⌛")]
459 Hourglass, #[token("δ")]
463 #[token("Δ")]
464 Delta, #[token("ε")]
467 Epsilon, #[token("ω")]
470 #[token("Ω")]
471 Omega, #[token("α")]
474 Alpha, #[token("ζ")]
477 Zeta, #[token("μ")]
481 #[token("Μ")]
482 Mu, #[token("χ")]
485 #[token("Χ")]
486 Chi, #[token("ν")]
489 #[token("Ν")]
490 Nu, #[token("ξ")]
493 #[token("Ξ")]
494 Xi, #[token("ψ")]
497 #[token("Ψ")]
498 Psi, #[token("θ")]
501 #[token("Θ")]
502 Theta, #[token("κ")]
505 #[token("Κ")]
506 Kappa, #[token("∥")]
510 #[token("parallel")]
511 Parallel, #[token("⊛")]
514 #[token("gpu")]
515 Gpu, #[token("∀")]
519 ForAll, #[token("∃")]
522 Exists, #[token("∈")]
525 ElementOf, #[token("∉")]
528 NotElementOf, #[token("∪")]
532 Union, #[token("∩")]
535 Intersection, #[token("∖")]
538 SetMinus, #[token("⊂")]
541 Subset, #[token("⊆")]
544 SubsetEq, #[token("⊃")]
547 Superset, #[token("⊇")]
550 SupersetEq, #[token("∧")]
554 LogicAnd, #[token("∨")]
557 LogicOr, #[token("¬")]
560 LogicNot, #[token("⊻")]
563 LogicXor, #[token("⊤")]
566 Top, #[token("⊥")]
569 Bottom, #[token("⋏")]
573 BitwiseAndSymbol, #[token("⋎")]
576 BitwiseOrSymbol, #[token("⊙")]
579 CircledDot, #[token("∷")]
585 TypeAnnotation, #[token("∫")]
589 Integral, #[token("∂")]
592 Partial, #[token("√")]
595 Sqrt, #[token("∛")]
598 Cbrt, #[token("∇")]
601 Nabla, #[token("⍋")]
605 GradeUp, #[token("⍒")]
608 GradeDown, #[token("⌽")]
611 Rotate, #[token("↻")]
614 CycleArrow, #[token("⌺")]
617 QuadDiamond, #[token("⊞")]
620 SquaredPlus, #[token("⍳")]
623 Iota, #[token("∘")]
627 Compose, #[token("⊗")]
630 Tensor, #[token("⊕")]
633 DirectSum, #[token("⋈")]
637 Bowtie, #[token("⋳")]
640 ElementSmallVerticalBar, #[token("⊔")]
643 SquareCup, #[token("⊓")]
646 SquareCap, #[token("‽")]
651 Interrobang, #[token("◊")]
654 Lozenge, #[token("∿")]
660 #[token("legion_field")]
661 LegionField, #[token("⫰")]
664 #[token("interfere")]
665 Interfere, #[token("⟁")]
668 #[token("distribute")]
669 Distribute, #[token("⟀")]
672 #[token("gather")]
673 Gather, #[token("↠")]
676 #[token("broadcast")]
677 Broadcast, #[token("⇢")]
680 #[token("consensus")]
681 Consensus, #[token("⊕=")]
685 DirectSumEq, #[token("∂=")]
688 PartialEq_, #[token("⫰=")]
691 InterfereEq, #[token("⊖")]
696 AffectNegative, #[token("⊜")]
699 AffectNeutral, #[token("⸮")]
705 IronyMark, #[token("↑")]
709 IntensityUp, #[token("↓")]
712 IntensityDown, #[token("⇈")]
715 IntensityMax, #[token("♔")]
719 FormalRegister, #[token("♟")]
722 InformalRegister, #[token("☺")]
726 EmotionJoy, #[token("☹")]
729 EmotionSadness, #[token("⚡")]
732 EmotionAnger, #[token("❄")]
735 EmotionFear, #[token("✦")]
738 EmotionSurprise, #[token("♡")]
741 EmotionLove, #[token("◉")]
745 ConfidenceHigh, #[token("◎")]
748 ConfidenceMedium, #[token("○")]
751 ConfidenceLow, #[token("·ing")]
755 AspectProgressive, #[token("·ed")]
758 AspectPerfective, #[token("·able")]
761 AspectPotential, #[token("·ive")]
764 AspectResultative, #[token("|")]
768 Pipe,
769 #[token("·")]
770 MiddleDot, #[token("->")]
772 Arrow,
773 #[token("=>")]
774 FatArrow,
775 #[token("<-")]
776 LeftArrow,
777 #[token("==")]
778 EqEq,
779 #[token("!=")]
780 NotEq,
781 #[token("<=")]
782 LtEq,
783 #[token(">=")]
784 GtEq,
785 #[token("<")]
786 Lt,
787 #[token(">")]
788 Gt,
789 #[token("+")]
790 Plus,
791 #[token("-")]
792 Minus,
793 #[token("*")]
794 Star,
795 #[token("/")]
796 Slash,
797 #[token("%")]
798 Percent,
799 #[token("**")]
800 StarStar, #[token("&&")]
802 AndAnd,
803 #[token("||")]
804 OrOr,
805 #[token("!")]
806 Bang, #[token("?")]
808 Question, #[token("~")]
810 Tilde, #[token("&")]
812 Amp,
813 #[token("^")]
814 Caret,
815 #[token("<<=")]
816 ShlEq,
817 #[token(">>=")]
818 ShrEq,
819 #[token("<<")]
820 Shl,
821 #[token(">>")]
822 Shr,
823 #[token("=")]
824 Eq,
825 #[token("+=")]
826 PlusEq,
827 #[token("-=")]
828 MinusEq,
829 #[token("*=")]
830 StarEq,
831 #[token("/=")]
832 SlashEq,
833 #[token("%=")]
834 PercentEq,
835 #[token("|=")]
836 PipeEq,
837 #[token("&=")]
838 AmpEq,
839 #[token("^=")]
840 CaretEq,
841 #[token("..")]
842 DotDot,
843 #[token("..=")]
844 DotDotEq,
845 #[token("++")]
846 PlusPlus, #[token("::")]
848 ColonColon,
849 #[token(":")]
850 Colon,
851 #[token(";")]
852 Semi,
853 #[token(",")]
854 Comma,
855 #[token(".")]
856 Dot,
857 #[token("@")]
858 At,
859 #[token("#!")]
860 HashBang, #[token("#")]
862 Hash,
863 #[token("_", priority = 3)]
864 Underscore,
865
866 #[token("(")]
868 LParen,
869 #[token(")")]
870 RParen,
871 #[token("{")]
872 LBrace,
873 #[token("}")]
874 RBrace,
875 #[token("[")]
876 LBracket,
877 #[token("]")]
878 RBracket,
879
880 #[token("∅")]
882 Empty, #[token("◯")]
884 Circle, #[token("∞")]
886 Infinity, #[token("⇒")]
890 ProtoSend, #[token("⇐")]
893 ProtoRecv, #[token("≋")]
896 ProtoStream, #[token("⊸")]
899 ProtoConnect, #[token("⏱")]
902 ProtoTimeout, #[token("send")]
908 Send,
909 #[token("recv")]
910 Recv,
911 #[token("stream")]
912 Stream,
913 #[token("connect")]
914 Connect,
915 #[token("close")]
916 Close,
917 #[token("timeout")]
918 Timeout,
919 #[token("retry")]
920 Retry,
921 #[token("header")]
922 Header,
923 #[token("body")]
924 Body,
925
926 #[token("http")]
928 Http,
929 #[token("https")]
930 Https,
931 #[token("ws")]
932 Ws,
933 #[token("wss")]
934 Wss,
935 #[token("grpc")]
936 Grpc,
937 #[token("kafka")]
938 Kafka,
939 #[token("amqp")]
940 Amqp,
941 #[token("graphql")]
942 GraphQL,
943
944 #[regex(r"0b[01_]+(i8|i16|i32|i64|i128|isize|u8|u16|u32|u64|u128|usize)?", |lex| lex.slice().to_string())]
947 BinaryLit(String),
948
949 #[regex(r"0o[0-7_]+(i8|i16|i32|i64|i128|isize|u8|u16|u32|u64|u128|usize)?", |lex| lex.slice().to_string())]
951 OctalLit(String),
952
953 #[regex(r"0x[0-9a-fA-F_]+(i8|i16|i32|i64|i128|isize|u8|u16|u32|u64|u128|usize)?", |lex| lex.slice().to_string())]
955 HexLit(String),
956
957 #[regex(r"0v[0-9a-jA-J_]+", |lex| lex.slice().to_string())]
959 VigesimalLit(String),
960
961 #[regex(r"0s[0-9a-zA-Z_]+", |lex| lex.slice().to_string())]
963 SexagesimalLit(String),
964
965 #[regex(r"0z[0-9a-bA-B_]+", |lex| lex.slice().to_string())]
967 DuodecimalLit(String),
968
969 #[regex(r"([0-9][0-9_]*\.[0-9][0-9_]*([eE][+-]?[0-9_]+)?|[0-9][0-9_]*[eE][+-]?[0-9_]+)(f16|f32|f64|f128)?", |lex| lex.slice().to_string())]
972 FloatLit(String),
973
974 #[regex(r"[0-9][0-9_]*(i8|i16|i32|i64|i128|isize|u8|u16|u32|u64|u128|usize)?", |lex| lex.slice().to_string())]
976 IntLit(String),
977
978 #[regex(r#""([^"\\]|\\(.|\n))*""#, |lex| {
982 let s = lex.slice();
983 let inner = &s[1..s.len()-1];
984 process_escape_sequences(inner)
985 })]
986 StringLit(String),
987
988 #[token(r#"""""#, multiline_string_callback)]
990 MultiLineStringLit(String),
991
992 #[regex(r#"b"([^"\\]|\\.)*""#, |lex| {
994 let s = lex.slice();
995 let inner = &s[2..s.len()-1];
996 process_byte_escape_sequences(inner)
997 })]
998 ByteStringLit(Vec<u8>),
999
1000 #[regex(r#"f"([^"\\]|\\.)*""#, |lex| {
1002 let s = lex.slice();
1003 let inner = &s[2..s.len()-1];
1004 process_escape_sequences(inner)
1005 })]
1006 InterpolatedStringLit(String),
1007
1008 #[regex(r#"σ"([^"\\]|\\.)*""#, |lex| {
1010 let s = lex.slice();
1011 let start = "σ".len() + 1; let inner = &s[start..s.len()-1];
1014 process_escape_sequences(inner)
1015 })]
1016 SigilStringSql(String),
1017
1018 #[regex(r#"ρ"([^"\\]|\\.)*""#, |lex| {
1020 let s = lex.slice();
1021 let start = "ρ".len() + 1; let inner = &s[start..s.len()-1];
1024 process_escape_sequences(inner)
1025 })]
1026 SigilStringRoute(String),
1027
1028 #[regex(r"'([^'\\]|\\x[0-9a-fA-F]{2}|\\u\{[0-9a-fA-F]{1,6}\}|\\.)'", |lex| {
1031 let s = lex.slice();
1032 let inner = &s[1..s.len()-1];
1033 process_char_escape(inner)
1034 })]
1035 CharLit(char),
1036
1037 #[regex(r"b'([^'\\]|\\x[0-9a-fA-F]{2}|\\.)'", |lex| {
1039 let s = lex.slice();
1040 let inner = &s[2..s.len()-1];
1042 process_byte_char_escape(inner)
1043 })]
1044 ByteCharLit(u8),
1045
1046 #[regex(r#"r"([^"\\]|\\.)*""#, |lex| {
1048 let s = lex.slice();
1049 s[2..s.len()-1].to_string()
1050 })]
1051 RawStringLit(String),
1052
1053 #[token(r##"r#""##, raw_string_delimited_callback)]
1055 RawStringDelimited(String),
1056
1057 #[regex(r"'[a-zA-Z_][a-zA-Z0-9_]*", |lex| lex.slice()[1..].to_string())]
1059 Lifetime(String),
1060
1061 #[regex(r"[a-zA-Z_αΑβΒγΓδΔεΕζΖηΗθΘιΙκΚλΛμΜνΝξΞοΟπΠρΡσΣτΤυΥφΦχΧψΨωΩ][a-zA-Z0-9_αΑβΒγΓδΔεΕζΖηΗθΘιΙκΚλΛμΜνΝξΞοΟπΠρΡσΣτΤυΥφΦχΧψΨωΩ]*", |lex| lex.slice().to_string())]
1065 Ident(String),
1066
1067 #[regex(r"//@\s*rune:\s*[a-zA-Z_][a-zA-Z0-9_]*", |lex| lex.slice().to_string())]
1069 RuneAnnotation(String),
1070}
1071
1072impl Token {
1073 pub fn is_keyword(&self) -> bool {
1074 matches!(
1075 self,
1076 Token::Fn
1077 | Token::Async
1078 | Token::Let
1079 | Token::Mut
1080 | Token::Const
1081 | Token::Type
1082 | Token::Struct
1083 | Token::Enum
1084 | Token::Trait
1085 | Token::Impl
1086 | Token::Mod
1087 | Token::Use
1088 | Token::Pub
1089 | Token::Actor
1090 | Token::Saga
1091 | Token::Scope
1092 | Token::Rune
1093 | Token::If
1094 | Token::Else
1095 | Token::Match
1096 | Token::Loop
1097 | Token::While
1098 | Token::For
1099 | Token::In
1100 | Token::Break
1101 | Token::Continue
1102 | Token::Return
1103 | Token::Yield
1104 | Token::Await
1105 ) || self.is_plurality_keyword()
1106 }
1107
1108 pub fn is_plurality_keyword(&self) -> bool {
1109 matches!(
1110 self,
1111 Token::Alter
1112 | Token::Switch
1113 | Token::Headspace
1114 | Token::CoCon
1115 | Token::Reality
1116 | Token::Split
1117 | Token::Trigger
1118 | Token::Layer
1119 | Token::Location
1120 | Token::States
1121 | Token::Anima
1122 | Token::To
1123 | Token::From
1124 )
1125 }
1126
1127 pub fn is_alter_source(&self) -> bool {
1128 matches!(
1129 self,
1130 Token::AlterSourceFronting
1131 | Token::AlterSourceCoCon
1132 | Token::AlterSourceDormant
1133 | Token::AlterSourceBlended
1134 )
1135 }
1136
1137 pub fn is_morpheme(&self) -> bool {
1138 matches!(
1139 self,
1140 Token::Tau | Token::Phi | Token::Sigma | Token::Rho |
1141 Token::Lambda | Token::Pi | Token::Hourglass |
1142 Token::Delta | Token::Epsilon | Token::Omega | Token::Alpha | Token::Zeta |
1143 Token::Mu | Token::Chi | Token::Nu | Token::Xi | Token::Parallel | Token::Gpu | Token::Integral | Token::Partial | Token::Sqrt | Token::Cbrt |
1146 Token::Compose
1147 )
1148 }
1149
1150 pub fn is_aspect(&self) -> bool {
1151 matches!(
1152 self,
1153 Token::AspectProgressive
1154 | Token::AspectPerfective
1155 | Token::AspectPotential
1156 | Token::AspectResultative
1157 )
1158 }
1159
1160 pub fn is_data_op(&self) -> bool {
1161 matches!(
1162 self,
1163 Token::Bowtie | Token::ElementSmallVerticalBar | Token::SquareCup | Token::SquareCap
1164 )
1165 }
1166
1167 pub fn is_bitwise_symbol(&self) -> bool {
1168 matches!(self, Token::BitwiseAndSymbol | Token::BitwiseOrSymbol)
1169 }
1170
1171 pub fn is_quantifier(&self) -> bool {
1172 matches!(
1173 self,
1174 Token::ForAll | Token::Exists | Token::ElementOf | Token::NotElementOf
1175 )
1176 }
1177
1178 pub fn is_set_op(&self) -> bool {
1179 matches!(
1180 self,
1181 Token::Union
1182 | Token::Intersection
1183 | Token::SetMinus
1184 | Token::Subset
1185 | Token::SubsetEq
1186 | Token::Superset
1187 | Token::SupersetEq
1188 )
1189 }
1190
1191 pub fn is_logic_op(&self) -> bool {
1192 matches!(
1193 self,
1194 Token::LogicAnd
1195 | Token::LogicOr
1196 | Token::LogicNot
1197 | Token::LogicXor
1198 | Token::Top
1199 | Token::Bottom
1200 )
1201 }
1202
1203 pub fn is_evidentiality(&self) -> bool {
1204 matches!(
1205 self,
1206 Token::Bang | Token::Question | Token::Tilde | Token::Interrobang | Token::Lozenge
1207 )
1208 }
1209
1210 pub fn is_legion_morpheme(&self) -> bool {
1211 matches!(
1212 self,
1213 Token::LegionField | Token::DirectSum | Token::Interfere | Token::ConfidenceHigh | Token::Distribute | Token::Gather | Token::Broadcast | Token::Consensus | Token::Partial )
1223 }
1224
1225 pub fn is_legion_assign(&self) -> bool {
1226 matches!(
1227 self,
1228 Token::DirectSumEq | Token::PartialEq_ | Token::InterfereEq
1229 )
1230 }
1231
1232 pub fn is_affective(&self) -> bool {
1233 matches!(
1234 self,
1235 Token::DirectSum | Token::AffectNegative | Token::AffectNeutral | Token::IronyMark | Token::IntensityUp | Token::IntensityDown | Token::IntensityMax | Token::FormalRegister | Token::InformalRegister | Token::EmotionJoy | Token::EmotionSadness | Token::EmotionAnger | Token::EmotionFear | Token::EmotionSurprise | Token::EmotionLove | Token::ConfidenceHigh | Token::ConfidenceMedium | Token::ConfidenceLow )
1260 }
1261
1262 pub fn is_sentiment(&self) -> bool {
1263 matches!(
1264 self,
1265 Token::DirectSum | Token::AffectNegative | Token::AffectNeutral
1266 )
1267 }
1268
1269 pub fn is_emotion(&self) -> bool {
1270 matches!(
1271 self,
1272 Token::EmotionJoy
1273 | Token::EmotionSadness
1274 | Token::EmotionAnger
1275 | Token::EmotionFear
1276 | Token::EmotionSurprise
1277 | Token::EmotionLove
1278 )
1279 }
1280
1281 pub fn is_intensity(&self) -> bool {
1282 matches!(
1283 self,
1284 Token::IntensityUp | Token::IntensityDown | Token::IntensityMax
1285 )
1286 }
1287}
1288
/// Buffered wrapper around the logos lexer that adds arbitrary lookahead
/// (`peek`/`peek_n`) on top of `next_token`.
pub struct Lexer<'a> {
    /// The underlying logos lexer over the source text.
    inner: logos::Lexer<'a, Token>,
    /// Lookahead buffer filled by `peek_n`; a `None` entry records that
    /// end-of-input was reached while peeking.
    buffer: Vec<Option<(Token, Span)>>,
}
1295
1296impl<'a> Lexer<'a> {
1297 pub fn new(source: &'a str) -> Self {
1298 Self {
1299 inner: Token::lexer(source),
1300 buffer: Vec::new(),
1301 }
1302 }
1303
1304 fn read_next(&mut self) -> Option<(Token, Span)> {
1306 match self.inner.next() {
1307 Some(Ok(token)) => {
1308 let span = self.inner.span();
1309 Some((token, Span::new(span.start, span.end)))
1310 }
1311 Some(Err(_)) => {
1312 self.read_next()
1314 }
1315 None => None,
1316 }
1317 }
1318
1319 pub fn next_token(&mut self) -> Option<(Token, Span)> {
1320 if !self.buffer.is_empty() {
1321 return self.buffer.remove(0);
1324 }
1325 self.read_next()
1326 }
1327
1328 pub fn peek(&mut self) -> Option<&(Token, Span)> {
1329 self.peek_n(0)
1330 }
1331
1332 pub fn peek_n(&mut self, n: usize) -> Option<&(Token, Span)> {
1334 while self.buffer.len() <= n {
1336 let token = self.read_next();
1337 self.buffer.push(token);
1338 }
1339 self.buffer.get(n).and_then(|opt| opt.as_ref())
1340 }
1341
1342 pub fn span(&self) -> Span {
1343 let span = self.inner.span();
1344 Span::new(span.start, span.end)
1345 }
1346}
1347
#[cfg(test)]
mod tests {
    // End-to-end lexer tests: each test feeds a source snippet through
    // `Lexer` and asserts the exact token stream (or a prefix of it).
    use super::*;

    // Greek morphemes; upper-case Σ maps to the same Sigma variant.
    #[test]
    fn test_morphemes() {
        let mut lexer = Lexer::new("τ φ σ ρ λ Σ Π ⌛");
        assert!(matches!(lexer.next_token(), Some((Token::Tau, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Phi, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Sigma, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Rho, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Lambda, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Sigma, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Pi, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Hourglass, _))));
    }

    // Evidentiality marks lex as separate tokens after an identifier.
    #[test]
    fn test_evidentiality() {
        let mut lexer = Lexer::new("value! uncertain? reported~ paradox‽");
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "value"));
        assert!(matches!(lexer.next_token(), Some((Token::Bang, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "uncertain"));
        assert!(matches!(lexer.next_token(), Some((Token::Question, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "reported"));
        assert!(matches!(lexer.next_token(), Some((Token::Tilde, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "paradox"));
        assert!(matches!(lexer.next_token(), Some((Token::Interrobang, _))));
    }

    // Only the leading prefix of the chain is asserted here.
    #[test]
    fn test_pipe_chain() {
        let mut lexer = Lexer::new("data|τ{f}|φ{p}|σ");
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "data"));
        assert!(matches!(lexer.next_token(), Some((Token::Pipe, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Tau, _))));
        assert!(matches!(lexer.next_token(), Some((Token::LBrace, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "f"));
        assert!(matches!(lexer.next_token(), Some((Token::RBrace, _))));
    }

    // One literal per supported numeric base, plus a float.
    #[test]
    fn test_numbers() {
        let mut lexer = Lexer::new("42 0b1010 0o52 0x2A 0v22 0s42 3.14");
        assert!(matches!(lexer.next_token(), Some((Token::IntLit(s), _)) if s == "42"));
        assert!(matches!(lexer.next_token(), Some((Token::BinaryLit(s), _)) if s == "0b1010"));
        assert!(matches!(lexer.next_token(), Some((Token::OctalLit(s), _)) if s == "0o52"));
        assert!(matches!(lexer.next_token(), Some((Token::HexLit(s), _)) if s == "0x2A"));
        assert!(matches!(lexer.next_token(), Some((Token::VigesimalLit(s), _)) if s == "0v22"));
        assert!(matches!(lexer.next_token(), Some((Token::SexagesimalLit(s), _)) if s == "0s42"));
        assert!(matches!(lexer.next_token(), Some((Token::FloatLit(s), _)) if s == "3.14"));
    }

    // Middle-dot incorporation splits into Ident/MiddleDot alternation.
    #[test]
    fn test_incorporation() {
        let mut lexer = Lexer::new("file·open·read");
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "file"));
        assert!(matches!(lexer.next_token(), Some((Token::MiddleDot, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "open"));
        assert!(matches!(lexer.next_token(), Some((Token::MiddleDot, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "read"));
    }

    #[test]
    fn test_special_symbols() {
        let mut lexer = Lexer::new("∅ ◯ ∞");
        assert!(matches!(lexer.next_token(), Some((Token::Empty, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Circle, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Infinity, _))));
    }

    #[test]
    fn test_quantifiers() {
        let mut lexer = Lexer::new("∀x ∃y x∈S y∉T");
        assert!(matches!(lexer.next_token(), Some((Token::ForAll, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "x"));
        assert!(matches!(lexer.next_token(), Some((Token::Exists, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "y"));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "x"));
        assert!(matches!(lexer.next_token(), Some((Token::ElementOf, _))));
    }

    #[test]
    fn test_set_operations() {
        let mut lexer = Lexer::new("A∪B A∩B A∖B A⊂B A⊆B");
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "A"));
        assert!(matches!(lexer.next_token(), Some((Token::Union, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "B"));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "A"));
        assert!(matches!(lexer.next_token(), Some((Token::Intersection, _))));
    }

    #[test]
    fn test_logic_operators() {
        let mut lexer = Lexer::new("p∧q p∨q ¬p p⊻q ⊤ ⊥");
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "p"));
        assert!(matches!(lexer.next_token(), Some((Token::LogicAnd, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "q"));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "p"));
        assert!(matches!(lexer.next_token(), Some((Token::LogicOr, _))));
    }

    #[test]
    fn test_analysis_operators() {
        let mut lexer = Lexer::new("∫f ∂g √x ∛y f∘g");
        assert!(matches!(lexer.next_token(), Some((Token::Integral, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "f"));
        assert!(matches!(lexer.next_token(), Some((Token::Partial, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Ident(s), _)) if s == "g"));
        assert!(matches!(lexer.next_token(), Some((Token::Sqrt, _))));
    }

    #[test]
    fn test_additional_morphemes() {
        let mut lexer = Lexer::new("δ ε ω α ζ");
        assert!(matches!(lexer.next_token(), Some((Token::Delta, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Epsilon, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Omega, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Alpha, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Zeta, _))));
    }

    #[test]
    fn test_ffi_keywords() {
        let mut lexer = Lexer::new("extern unsafe");
        assert!(matches!(lexer.next_token(), Some((Token::Extern, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Unsafe, _))));
    }

    // Symbol and word forms lex to the same variant.
    #[test]
    fn test_parallel_morphemes() {
        let mut lexer = Lexer::new("∥ parallel ⊛ gpu");
        assert!(matches!(lexer.next_token(), Some((Token::Parallel, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Parallel, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Gpu, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Gpu, _))));
    }

    #[test]
    fn test_lifetime_labels() {
        let mut lexer = Lexer::new("'outer: loop { break 'outer }");
        assert!(matches!(lexer.next_token(), Some((Token::Lifetime(s), _)) if s == "outer"));
        assert!(matches!(lexer.next_token(), Some((Token::Colon, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Loop, _))));
        assert!(matches!(lexer.next_token(), Some((Token::LBrace, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Break, _))));
        assert!(matches!(lexer.next_token(), Some((Token::Lifetime(s), _)) if s == "outer"));
        assert!(matches!(lexer.next_token(), Some((Token::RBrace, _))));
    }

    // Each simple escape decodes to its control/literal character.
    #[test]
    fn test_string_escape_sequences() {
        let mut lexer = Lexer::new(r#""hello\nworld""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "hello\nworld"),
            other => panic!("Expected StringLit, got {:?}", other),
        }

        let mut lexer = Lexer::new(r#""hello\tworld""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "hello\tworld"),
            other => panic!("Expected StringLit, got {:?}", other),
        }

        let mut lexer = Lexer::new(r#""hello\rworld""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "hello\rworld"),
            other => panic!("Expected StringLit, got {:?}", other),
        }

        let mut lexer = Lexer::new(r#""hello\\world""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "hello\\world"),
            other => panic!("Expected StringLit, got {:?}", other),
        }

        let mut lexer = Lexer::new(r#""hello\"world""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "hello\"world"),
            other => panic!("Expected StringLit, got {:?}", other),
        }

        let mut lexer = Lexer::new(r#""hello\0world""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "hello\0world"),
            other => panic!("Expected StringLit, got {:?}", other),
        }
    }

    #[test]
    fn test_string_hex_escape() {
        let mut lexer = Lexer::new(r#""hello\x41world""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "helloAworld"),
            other => panic!("Expected StringLit, got {:?}", other),
        }
    }

    #[test]
    fn test_string_unicode_escape() {
        let mut lexer = Lexer::new(r#""hello\u{1F600}world""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "hello😀world"),
            other => panic!("Expected StringLit, got {:?}", other),
        }

        let mut lexer = Lexer::new(r#""\u{03C4}""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "τ"),
            other => panic!("Expected StringLit, got {:?}", other),
        }
    }

    #[test]
    fn test_char_escape_sequences() {
        let mut lexer = Lexer::new(r"'\n'");
        match lexer.next_token() {
            Some((Token::CharLit(c), _)) => assert_eq!(c, '\n'),
            other => panic!("Expected CharLit, got {:?}", other),
        }

        let mut lexer = Lexer::new(r"'\t'");
        match lexer.next_token() {
            Some((Token::CharLit(c), _)) => assert_eq!(c, '\t'),
            other => panic!("Expected CharLit, got {:?}", other),
        }

        let mut lexer = Lexer::new(r"'\\'");
        match lexer.next_token() {
            Some((Token::CharLit(c), _)) => assert_eq!(c, '\\'),
            other => panic!("Expected CharLit, got {:?}", other),
        }
    }

    // Raw strings keep backslashes verbatim.
    #[test]
    fn test_raw_string() {
        let mut lexer = Lexer::new(r#"r"hello\nworld""#);
        match lexer.next_token() {
            Some((Token::RawStringLit(s), _)) => assert_eq!(s, r"hello\nworld"),
            other => panic!("Expected RawStringLit, got {:?}", other),
        }
    }

    // r#"..."# may contain unescaped double quotes.
    #[test]
    fn test_raw_string_delimited() {
        let mut lexer = Lexer::new(r##"r#"hello "world""#"##);
        match lexer.next_token() {
            Some((Token::RawStringDelimited(s), _)) => assert_eq!(s, r#"hello "world""#),
            other => panic!("Expected RawStringDelimited, got {:?}", other),
        }
    }

    #[test]
    fn test_byte_string() {
        let mut lexer = Lexer::new(r#"b"hello""#);
        match lexer.next_token() {
            Some((Token::ByteStringLit(bytes), _)) => {
                // ASCII bytes for "hello".
                assert_eq!(bytes, vec![104, 101, 108, 108, 111]);
            }
            other => panic!("Expected ByteStringLit, got {:?}", other),
        }
    }

    // Interpolation braces pass through untouched at the lexer level.
    #[test]
    fn test_interpolated_string() {
        let mut lexer = Lexer::new(r#"f"hello {name}""#);
        match lexer.next_token() {
            Some((Token::InterpolatedStringLit(s), _)) => assert_eq!(s, "hello {name}"),
            other => panic!("Expected InterpolatedStringLit, got {:?}", other),
        }
    }

    #[test]
    fn test_sigil_string_sql() {
        let mut lexer = Lexer::new(r#"σ"SELECT * FROM {table}""#);
        match lexer.next_token() {
            Some((Token::SigilStringSql(s), _)) => assert_eq!(s, "SELECT * FROM {table}"),
            other => panic!("Expected SigilStringSql, got {:?}", other),
        }
    }

    #[test]
    fn test_sigil_string_route() {
        let mut lexer = Lexer::new(r#"ρ"/api/v1/{resource}/{id}""#);
        match lexer.next_token() {
            Some((Token::SigilStringRoute(s), _)) => assert_eq!(s, "/api/v1/{resource}/{id}"),
            other => panic!("Expected SigilStringRoute, got {:?}", other),
        }
    }

    // Non-ASCII content is preserved in ordinary string literals.
    #[test]
    fn test_unicode_in_strings() {
        let mut lexer = Lexer::new(r#""τφσρ 你好 🦀""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, "τφσρ 你好 🦀"),
            other => panic!("Expected StringLit, got {:?}", other),
        }
    }

    #[test]
    fn test_empty_string() {
        let mut lexer = Lexer::new(r#""""#);
        match lexer.next_token() {
            Some((Token::StringLit(s), _)) => assert_eq!(s, ""),
            other => panic!("Expected empty StringLit, got {:?}", other),
        }
    }

    // Direct unit tests of the escape-decoding helper.
    #[test]
    fn test_escape_sequence_helper() {
        assert_eq!(process_escape_sequences(r"hello\nworld"), "hello\nworld");
        assert_eq!(process_escape_sequences(r"hello\tworld"), "hello\tworld");
        assert_eq!(process_escape_sequences(r"hello\\world"), "hello\\world");
        assert_eq!(process_escape_sequences(r#"hello\"world"#), "hello\"world");
        assert_eq!(process_escape_sequences(r"hello\x41world"), "helloAworld");
        assert_eq!(
            process_escape_sequences(r"hello\u{1F600}world"),
            "hello😀world"
        );
    }
}