1use logos::{Lexer, Logos};
13use text_size::{TextRange, TextSize};
14
15use crate::SyntaxKind;
16
/// A single lexed token: its syntax kind plus the span of source text it covers.
///
/// Tokens carry no text of their own; slice the original source with `range`
/// to recover it.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct RawToken {
    /// Kind assigned to the token (`SyntaxKind::Error` for input the lexer rejected).
    pub kind: SyntaxKind,
    /// Byte range of the token within the source string passed to `tokenize`.
    pub range: TextRange,
}
25
/// Raw token kinds recognised by the Logos-derived lexer.
///
/// This enum is private to the module: `map_kind` translates every variant into
/// the public `SyntaxKind`, and `Ident` is further split into keyword and
/// literal kinds by `reclassify_ident`.
#[derive(Logos, Debug, Clone, Copy, PartialEq, Eq)]
enum LexKind {
    /// The U+FEFF byte-order mark.
    #[token("\u{feff}", priority = 10)]
    Bom,
    /// A run of spaces and/or tabs (line breaks are separate tokens).
    #[regex(r"[ \t]+")]
    Whitespace,
    /// One physical line break: CRLF, LF, or a lone CR.
    #[regex(r"\r\n|\n|\r")]
    NewlinePhys,
    /// A backslash immediately followed by a line break.
    #[regex(r"\\(\r\n|\n|\r)")]
    LineContinuation,
    // The four comment patterns below overlap (e.g. `#region x` also matches
    // `#[^\r\n]*`); the explicit priorities rank the more specific pattern higher.
    /// `#region` and the rest of its line.
    #[regex(r"#region[^\r\n]*", priority = 5, allow_greedy = true)]
    RegionComment,
    /// `#endregion` and the rest of its line.
    #[regex(r"#endregion[^\r\n]*", priority = 5, allow_greedy = true)]
    EndRegionComment,
    /// A `##` comment running to the end of the line.
    #[regex(r"##[^\r\n]*", priority = 4, allow_greedy = true)]
    DocComment,
    /// Any other `#` comment running to the end of the line.
    #[regex(r"#[^\r\n]*", priority = 2, allow_greedy = true)]
    LineComment,

    /// Integer literal: hexadecimal (`0x`), binary (`0b`) or decimal; `_` is
    /// accepted among the digits.
    #[regex(r"0[xX][0-9a-fA-F_]+|0[bB][01_]+|[0-9][0-9_]*")]
    Int,
    /// Float literal: digits followed by `.` (fraction and exponent optional),
    /// a fraction starting with `.`, or digits followed by an exponent.
    #[regex(r"[0-9][0-9_]*\.[0-9_]*([eE][+-]?[0-9_]+)?|\.[0-9][0-9_]*([eE][+-]?[0-9_]+)?|[0-9][0-9_]*[eE][+-]?[0-9_]+")]
    Float,
    /// String literal. The patterns match only the opening delimiter (single or
    /// triple, `"` or `'`, optionally prefixed with `r`); the `lex_string`
    /// callback then extends the token over the body and closing delimiter.
    #[token("\"", lex_string)]
    #[token("'", lex_string)]
    #[token("\"\"\"", lex_string)]
    #[token("'''", lex_string)]
    #[token("r\"", lex_string)]
    #[token("r'", lex_string)]
    #[token("r\"\"\"", lex_string)]
    #[token("r'''", lex_string)]
    String,
    /// `&`-prefixed single-quote-delimited or double-quote-delimited string,
    /// completed by `lex_string`.
    #[token("&\"", lex_string)]
    #[token("&'", lex_string)]
    StringName,
    /// `^`-prefixed single-quote-delimited or double-quote-delimited string,
    /// completed by `lex_string`.
    #[token("^\"", lex_string)]
    #[token("^'", lex_string)]
    NodePath,
    /// ASCII identifier. Keywords and named constants are lexed as `Ident` too
    /// and separated afterwards by `reclassify_ident`.
    #[regex(r"[A-Za-z_][A-Za-z0-9_]*")]
    Ident,

    // Delimiters and punctuation.
    #[token("(")]
    LParen,
    #[token(")")]
    RParen,
    #[token("[")]
    LBrack,
    #[token("]")]
    RBrack,
    #[token("{")]
    LBrace,
    #[token("}")]
    RBrace,
    #[token(",")]
    Comma,
    #[token(":")]
    Colon,
    #[token(";")]
    Semicolon,
    #[token(".")]
    Dot,
    #[token("..")]
    DotDot,
    #[token("...")]
    Ellipsis,
    #[token("@")]
    At,
    #[token("$")]
    Dollar,
    #[token("%")]
    Percent,
    #[token("&")]
    Amp,
    #[token("->")]
    Arrow,
    #[token(":=")]
    ColonEq,

    // Arithmetic, comparison, logical, bitwise and compound-assignment operators.
    #[token("+")]
    Plus,
    #[token("-")]
    Minus,
    #[token("*")]
    Star,
    #[token("/")]
    Slash,
    #[token("**")]
    StarStar,
    #[token("=")]
    Eq,
    #[token("==")]
    EqEq,
    #[token("!=")]
    Neq,
    #[token("<")]
    Lt,
    #[token(">")]
    Gt,
    #[token("<=")]
    Le,
    #[token(">=")]
    Ge,
    #[token("&&")]
    AmpAmp,
    #[token("||")]
    PipePipe,
    #[token("!")]
    Bang,
    #[token("~")]
    Tilde,
    #[token("|")]
    Pipe,
    #[token("^")]
    Caret,
    #[token("<<")]
    Shl,
    #[token(">>")]
    Shr,
    #[token("+=")]
    PlusEq,
    #[token("-=")]
    MinusEq,
    #[token("*=")]
    StarEq,
    #[token("/=")]
    SlashEq,
    #[token("**=")]
    StarStarEq,
    #[token("%=")]
    PercentEq,
    #[token("&=")]
    AmpEq,
    #[token("|=")]
    PipeEq,
    #[token("^=")]
    CaretEq,
    #[token("<<=")]
    ShlEq,
    #[token(">>=")]
    ShrEq,
}
183
184fn lex_string(lex: &mut Lexer<LexKind>) {
189 let opener = lex.slice().as_bytes();
190 let quote = opener[opener.len() - 1];
191 let triple =
192 opener.len() >= 3 && opener[opener.len() - 2] == quote && opener[opener.len() - 3] == quote;
193
194 let rem = lex.remainder().as_bytes();
195 let n = rem.len();
196 let mut i = 0usize;
197 while i < n {
198 let c = rem[i];
199 if c == b'\\' {
200 i += 2; continue;
202 }
203 if triple {
204 if c == quote && i + 2 < n && rem[i + 1] == quote && rem[i + 2] == quote {
205 i += 3; break;
207 }
208 } else {
209 if c == quote {
210 i += 1; break;
212 }
213 if c == b'\n' || c == b'\r' {
214 break; }
216 }
217 i += 1;
218 }
219 lex.bump(i.min(n));
220}
221
222#[must_use]
226pub fn tokenize(src: &str) -> Vec<RawToken> {
227 let mut out = Vec::new();
228 let mut lexer = LexKind::lexer(src);
229 while let Some(result) = lexer.next() {
230 let span = lexer.span();
231 let kind = match result {
232 Ok(lex_kind) => map_kind(lex_kind, &src[span.clone()]),
233 Err(()) => SyntaxKind::Error,
234 };
235 out.push(RawToken {
236 kind,
237 range: TextRange::new(text_size(span.start), text_size(span.end)),
238 });
239 }
240 out
241}
242
243fn text_size(offset: usize) -> TextSize {
245 TextSize::new(u32::try_from(offset).expect("source files must be smaller than 4 GiB"))
246}
247
248fn map_kind(kind: LexKind, text: &str) -> SyntaxKind {
251 use LexKind as L;
252 use SyntaxKind as S;
253 match kind {
254 L::Bom => S::Bom,
255 L::Whitespace => S::Whitespace,
256 L::NewlinePhys => S::NewlinePhys,
257 L::LineContinuation => S::LineContinuation,
258 L::RegionComment => S::RegionComment,
259 L::EndRegionComment => S::EndRegionComment,
260 L::DocComment => S::DocComment,
261 L::LineComment => S::LineComment,
262 L::Int => S::Int,
263 L::Float => S::Float,
264 L::String => S::String,
265 L::StringName => S::StringName,
266 L::NodePath => S::NodePath,
267 L::Ident => reclassify_ident(text),
268 L::LParen => S::LParen,
269 L::RParen => S::RParen,
270 L::LBrack => S::LBrack,
271 L::RBrack => S::RBrack,
272 L::LBrace => S::LBrace,
273 L::RBrace => S::RBrace,
274 L::Comma => S::Comma,
275 L::Colon => S::Colon,
276 L::Semicolon => S::Semicolon,
277 L::Dot => S::Dot,
278 L::DotDot => S::DotDot,
279 L::Ellipsis => S::Ellipsis,
280 L::At => S::At,
281 L::Dollar => S::Dollar,
282 L::Percent => S::Percent,
283 L::Amp => S::Amp,
284 L::Arrow => S::Arrow,
285 L::ColonEq => S::ColonEq,
286 L::Plus => S::Plus,
287 L::Minus => S::Minus,
288 L::Star => S::Star,
289 L::Slash => S::Slash,
290 L::StarStar => S::StarStar,
291 L::Eq => S::Eq,
292 L::EqEq => S::EqEq,
293 L::Neq => S::Neq,
294 L::Lt => S::Lt,
295 L::Gt => S::Gt,
296 L::Le => S::Le,
297 L::Ge => S::Ge,
298 L::AmpAmp => S::AmpAmp,
299 L::PipePipe => S::PipePipe,
300 L::Bang => S::Bang,
301 L::Tilde => S::Tilde,
302 L::Pipe => S::Pipe,
303 L::Caret => S::Caret,
304 L::Shl => S::Shl,
305 L::Shr => S::Shr,
306 L::PlusEq => S::PlusEq,
307 L::MinusEq => S::MinusEq,
308 L::StarEq => S::StarEq,
309 L::SlashEq => S::SlashEq,
310 L::StarStarEq => S::StarStarEq,
311 L::PercentEq => S::PercentEq,
312 L::AmpEq => S::AmpEq,
313 L::PipeEq => S::PipeEq,
314 L::CaretEq => S::CaretEq,
315 L::ShlEq => S::ShlEq,
316 L::ShrEq => S::ShrEq,
317 }
318}
319
320fn reclassify_ident(text: &str) -> SyntaxKind {
325 use SyntaxKind as S;
326 match text {
327 "if" => S::IfKw,
328 "elif" => S::ElifKw,
329 "else" => S::ElseKw,
330 "for" => S::ForKw,
331 "while" => S::WhileKw,
332 "match" => S::MatchKw,
333 "when" => S::WhenKw,
334 "break" => S::BreakKw,
335 "continue" => S::ContinueKw,
336 "pass" => S::PassKw,
337 "return" => S::ReturnKw,
338 "var" => S::VarKw,
339 "const" => S::ConstKw,
340 "enum" => S::EnumKw,
341 "func" => S::FuncKw,
342 "static" => S::StaticKw,
343 "signal" => S::SignalKw,
344 "class" => S::ClassKw,
345 "class_name" => S::ClassNameKw,
346 "extends" => S::ExtendsKw,
347 "is" => S::IsKw,
348 "in" => S::InKw,
349 "as" => S::AsKw,
350 "self" => S::SelfKw,
351 "super" => S::SuperKw,
352 "void" => S::VoidKw,
353 "await" => S::AwaitKw,
354 "preload" => S::PreloadKw,
355 "assert" => S::AssertKw,
356 "breakpoint" => S::BreakpointKw,
357 "not" => S::NotKw,
358 "and" => S::AndKw,
359 "or" => S::OrKw,
360 "yield" => S::YieldKw,
361 "namespace" => S::NamespaceKw,
362 "trait" => S::TraitKw,
363 "true" => S::True,
364 "false" => S::False,
365 "null" => S::Null,
366 "PI" => S::ConstPi,
367 "TAU" => S::ConstTau,
368 "INF" => S::ConstInf,
369 "NAN" => S::ConstNan,
370 _ => S::Ident,
371 }
372}
373
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that the tokens of `src` tile the input exactly: each token
    /// starts where the previous one ended, the last one ends at EOF, and
    /// concatenating the token texts reproduces `src`.
    fn assert_lossless(src: &str) {
        let mut cursor = TextSize::new(0);
        let mut rebuilt = String::new();
        for t in &tokenize(src) {
            assert_eq!(
                t.range.start(),
                cursor,
                "gap/overlap before {t:?} in {src:?}"
            );
            rebuilt.push_str(&src[t.range]);
            cursor = t.range.end();
        }
        assert_eq!(cursor, TextSize::of(src), "did not cover to EOF: {src:?}");
        assert_eq!(rebuilt, src, "round-trip mismatch for {src:?}");
    }

    /// Lexes `src` and keeps only the token kinds, in order.
    fn kinds(src: &str) -> Vec<SyntaxKind> {
        tokenize(src).iter().map(|tok| tok.kind).collect()
    }

    /// Asserts that each source in `cases` lexes to exactly one token of the given kind.
    fn assert_single_tokens<const N: usize>(cases: [(&str, SyntaxKind); N]) {
        for (src, expected) in cases {
            assert_eq!(kinds(src), vec![expected]);
        }
    }

    #[test]
    fn lossless_over_a_realistic_snippet() {
        assert_lossless(
            "## doc\n@export var hp: int = 100 # hi\nfunc _ready() -> void:\n\tprint($Player, %Unique)\n",
        );
    }

    #[test]
    fn keywords_and_literals_reclassified() {
        use SyntaxKind as S;
        assert_single_tokens([
            ("func", S::FuncKw),
            ("PI", S::ConstPi),
            ("my_var", S::Ident),
            ("class_name", S::ClassNameKw),
        ]);
        assert_eq!(
            kinds("true false null"),
            vec![S::True, S::Whitespace, S::False, S::Whitespace, S::Null]
        );
    }

    #[test]
    fn numbers() {
        use SyntaxKind as S;
        assert_single_tokens([
            ("0x8f51", S::Int),
            ("0b1010", S::Int),
            ("12_345", S::Int),
            ("3.14", S::Float),
            (".5", S::Float),
            ("1.", S::Float),
            ("58.1e-10", S::Float),
        ]);
    }

    #[test]
    fn strings_all_flavours() {
        use SyntaxKind as S;
        assert_single_tokens([
            (r#""hello""#, S::String),
            ("'world'", S::String),
            (r#""with \" escape""#, S::String),
            (r#"r"raw\n""#, S::String),
            ("\"\"\"multi\nline\"\"\"", S::String),
            (r#"&"sname""#, S::StringName),
            (r#"^"node/path""#, S::NodePath),
        ]);
        // A `$` in front of a string is its own token, not a string prefix.
        assert_eq!(kinds(r#"$"Player""#), vec![S::Dollar, S::String]);
    }

    #[test]
    fn unterminated_string_is_lossless() {
        let single_line = "\"oops\nok";
        assert_lossless(single_line);
        assert_eq!(kinds(single_line)[0], SyntaxKind::String);

        let triple = "\"\"\"never closed";
        assert_lossless(triple);
    }

    #[test]
    fn operators_longest_match() {
        use SyntaxKind as S;
        assert_single_tokens([
            ("**=", S::StarStarEq),
            (">>=", S::ShrEq),
            (":=", S::ColonEq),
            ("->", S::Arrow),
            ("...", S::Ellipsis),
            ("&&", S::AmpAmp),
        ]);
    }

    #[test]
    fn unlexable_byte_becomes_error_token() {
        let src = "a ` b";
        assert_lossless(src);
        let lexed = kinds(src);
        assert!(lexed.contains(&SyntaxKind::Error));
    }

    #[test]
    fn comments_distinguished() {
        use SyntaxKind as S;
        assert_single_tokens([
            ("# plain", S::LineComment),
            ("## doc", S::DocComment),
            ("#region A", S::RegionComment),
            ("#endregion", S::EndRegionComment),
        ]);
    }
}