1use crate::{kind::PerlSyntaxKind, language::PerlLanguage};
2use oak_core::{
3 Lexer, LexerCache, LexerState, OakError,
4 lexer::{CommentConfig, LexOutput, WhitespaceConfig},
5 source::Source,
6};
7use std::sync::LazyLock;
8
9type State<'s, S> = LexerState<'s, S, PerlLanguage>;
10
11static PERL_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
12static PERL_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "#", block_start: "", block_end: "", nested_blocks: false });
13
14#[derive(Clone, Debug)]
15pub struct PerlLexer<'config> {
16 _config: &'config PerlLanguage,
17}
18
19impl<'config> PerlLexer<'config> {
20 pub fn new(config: &'config PerlLanguage) -> Self {
21 Self { _config: config }
22 }
23
24 fn skip_whitespace<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
25 PERL_WHITESPACE.scan(state, PerlSyntaxKind::Whitespace)
26 }
27
28 fn skip_comment<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
29 PERL_COMMENT.scan(state, PerlSyntaxKind::Comment, PerlSyntaxKind::Comment)
30 }
31
32 fn lex_string<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
33 let start_pos = state.get_position();
34
35 if let Some(quote_char) = state.peek() {
36 if quote_char == '"' || quote_char == '\'' {
37 state.advance(1); let mut escaped = false;
40 while let Some(ch) = state.peek() {
41 if escaped {
42 escaped = false;
43 state.advance(ch.len_utf8());
44 }
45 else if ch == '\\' {
46 escaped = true;
47 state.advance(1);
48 }
49 else if ch == quote_char {
50 state.advance(1); break;
52 }
53 else if ch == '\n' || ch == '\r' {
54 break;
56 }
57 else {
58 state.advance(ch.len_utf8());
59 }
60 }
61
62 state.add_token(PerlSyntaxKind::StringLiteral, start_pos, state.get_position());
63 true
64 }
65 else {
66 false
67 }
68 }
69 else {
70 false
71 }
72 }
73
74 fn lex_variable<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
75 if let Some(ch) = state.peek() {
76 let start_pos = state.get_position();
77
78 match ch {
79 '$' => {
80 state.advance(1);
81 while let Some(ch) = state.peek() {
83 if ch.is_alphanumeric() || ch == '_' {
84 state.advance(ch.len_utf8());
85 }
86 else {
87 break;
88 }
89 }
90 state.add_token(PerlSyntaxKind::Dollar, start_pos, state.get_position());
91 true
92 }
93 '@' => {
94 state.advance(1);
95 while let Some(ch) = state.peek() {
97 if ch.is_alphanumeric() || ch == '_' {
98 state.advance(ch.len_utf8());
99 }
100 else {
101 break;
102 }
103 }
104 state.add_token(PerlSyntaxKind::At, start_pos, state.get_position());
105 true
106 }
107 '%' => {
108 state.advance(1);
109 while let Some(ch) = state.peek() {
111 if ch.is_alphanumeric() || ch == '_' {
112 state.advance(ch.len_utf8());
113 }
114 else {
115 break;
116 }
117 }
118 state.add_token(PerlSyntaxKind::Percent_, start_pos, state.get_position());
119 true
120 }
121 _ => false,
122 }
123 }
124 else {
125 false
126 }
127 }
128
129 fn lex_identifier_or_keyword<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
130 if let Some(ch) = state.peek() {
131 if ch.is_alphabetic() || ch == '_' {
132 let start_pos = state.get_position();
133 let mut text = String::new();
134
135 while let Some(ch) = state.peek() {
137 if ch.is_alphanumeric() || ch == '_' {
138 text.push(ch);
139 state.advance(ch.len_utf8());
140 }
141 else {
142 break;
143 }
144 }
145
146 let kind = match text.as_str() {
148 "if" => PerlSyntaxKind::If,
149 "else" => PerlSyntaxKind::Else,
150 "elsif" => PerlSyntaxKind::Elsif,
151 "unless" => PerlSyntaxKind::Unless,
152 "while" => PerlSyntaxKind::While,
153 "until" => PerlSyntaxKind::Until,
154 "for" => PerlSyntaxKind::For,
155 "foreach" => PerlSyntaxKind::Foreach,
156 "do" => PerlSyntaxKind::Do,
157 "sub" => PerlSyntaxKind::Sub,
158 "package" => PerlSyntaxKind::Package,
159 "use" => PerlSyntaxKind::Use,
160 "require" => PerlSyntaxKind::Require,
161 "my" => PerlSyntaxKind::My,
162 "our" => PerlSyntaxKind::Our,
163 "local" => PerlSyntaxKind::Local,
164 "return" => PerlSyntaxKind::Return,
165 "last" => PerlSyntaxKind::Last,
166 "next" => PerlSyntaxKind::Next,
167 "redo" => PerlSyntaxKind::Redo,
168 "die" => PerlSyntaxKind::Die,
169 "warn" => PerlSyntaxKind::Warn,
170 "eval" => PerlSyntaxKind::Eval,
171 "print" => PerlSyntaxKind::Print,
172 "printf" => PerlSyntaxKind::Printf,
173 "chomp" => PerlSyntaxKind::Chomp,
174 "chop" => PerlSyntaxKind::Chop,
175 "split" => PerlSyntaxKind::Split,
176 "join" => PerlSyntaxKind::Join,
177 "push" => PerlSyntaxKind::Push,
178 "pop" => PerlSyntaxKind::Pop,
179 "shift" => PerlSyntaxKind::Shift,
180 "unshift" => PerlSyntaxKind::Unshift,
181 "keys" => PerlSyntaxKind::Keys,
182 "values" => PerlSyntaxKind::Values,
183 "each" => PerlSyntaxKind::Each,
184 "exists" => PerlSyntaxKind::Exists,
185 "delete" => PerlSyntaxKind::Delete,
186 "defined" => PerlSyntaxKind::Defined,
187 "undef" => PerlSyntaxKind::Undef,
188 "ref" => PerlSyntaxKind::Ref,
189 "bless" => PerlSyntaxKind::Bless,
190 "new" => PerlSyntaxKind::New,
191 "and" => PerlSyntaxKind::And,
192 "or" => PerlSyntaxKind::Or,
193 "not" => PerlSyntaxKind::Not,
194 _ => PerlSyntaxKind::Identifier,
195 };
196
197 state.add_token(kind, start_pos, state.get_position());
198 true
199 }
200 else {
201 false
202 }
203 }
204 else {
205 false
206 }
207 }
208
209 fn lex_number<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
210 if let Some(ch) = state.peek() {
211 if ch.is_ascii_digit() {
212 let start_pos = state.get_position();
213 let mut has_dot = false;
214
215 while let Some(ch) = state.peek() {
217 if ch.is_ascii_digit() {
218 state.advance(1);
219 }
220 else if ch == '.' && !has_dot {
221 has_dot = true;
222 state.advance(1);
223 }
224 else {
225 break;
226 }
227 }
228
229 let kind = PerlSyntaxKind::NumberLiteral;
230
231 state.add_token(kind, start_pos, state.get_position());
232 true
233 }
234 else {
235 false
236 }
237 }
238 else {
239 false
240 }
241 }
242
243 fn lex_operators_and_punctuation<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
244 if let Some(ch) = state.peek() {
245 let start_pos = state.get_position();
246
247 let kind = match ch {
248 '+' => {
249 state.advance(1);
250 if let Some('+') = state.peek() {
251 state.advance(1);
252 PerlSyntaxKind::Increment
253 }
254 else if let Some('=') = state.peek() {
255 state.advance(1);
256 PerlSyntaxKind::PlusAssign
257 }
258 else {
259 PerlSyntaxKind::Plus
260 }
261 }
262 '-' => {
263 state.advance(1);
264 if let Some('-') = state.peek() {
265 state.advance(1);
266 PerlSyntaxKind::Decrement
267 }
268 else if let Some('=') = state.peek() {
269 state.advance(1);
270 PerlSyntaxKind::MinusAssign
271 }
272 else if let Some('>') = state.peek() {
273 state.advance(1);
274 PerlSyntaxKind::Arrow
275 }
276 else {
277 PerlSyntaxKind::Minus
278 }
279 }
280 '*' => {
281 state.advance(1);
282 if let Some('*') = state.peek() {
283 state.advance(1);
284 PerlSyntaxKind::Power
285 }
286 else if let Some('=') = state.peek() {
287 state.advance(1);
288 PerlSyntaxKind::MultiplyAssign
289 }
290 else {
291 PerlSyntaxKind::Star
292 }
293 }
294 '/' => {
295 state.advance(1);
296 if let Some('=') = state.peek() {
297 state.advance(1);
298 PerlSyntaxKind::DivideAssign
299 }
300 else {
301 PerlSyntaxKind::Slash
302 }
303 }
304 '=' => {
305 state.advance(1);
306 if let Some('=') = state.peek() {
307 state.advance(1);
308 PerlSyntaxKind::Equal
309 }
310 else if let Some('~') = state.peek() {
311 state.advance(1);
312 PerlSyntaxKind::Match
313 }
314 else {
315 PerlSyntaxKind::Assign
316 }
317 }
318 '!' => {
319 state.advance(1);
320 if let Some('=') = state.peek() {
321 state.advance(1);
322 PerlSyntaxKind::NotEqual
323 }
324 else if let Some('~') = state.peek() {
325 state.advance(1);
326 PerlSyntaxKind::NotMatch
327 }
328 else {
329 PerlSyntaxKind::Not
330 }
331 }
332 '<' => {
333 state.advance(1);
334 if let Some('=') = state.peek() {
335 state.advance(1);
336 if let Some('>') = state.peek() {
337 state.advance(1);
338 PerlSyntaxKind::Spaceship
339 }
340 else {
341 PerlSyntaxKind::LessEqual
342 }
343 }
344 else if let Some('<') = state.peek() {
345 state.advance(1);
346 PerlSyntaxKind::LeftShift
347 }
348 else {
349 PerlSyntaxKind::LessThan
350 }
351 }
352 '>' => {
353 state.advance(1);
354 if let Some('=') = state.peek() {
355 state.advance(1);
356 PerlSyntaxKind::GreaterEqual
357 }
358 else if let Some('>') = state.peek() {
359 state.advance(1);
360 PerlSyntaxKind::RightShift
361 }
362 else {
363 PerlSyntaxKind::GreaterThan
364 }
365 }
366 '&' => {
367 state.advance(1);
368 if let Some('&') = state.peek() {
369 state.advance(1);
370 PerlSyntaxKind::LogicalAnd
371 }
372 else {
373 PerlSyntaxKind::BitwiseAnd
374 }
375 }
376 '|' => {
377 state.advance(1);
378 if let Some('|') = state.peek() {
379 state.advance(1);
380 PerlSyntaxKind::LogicalOr
381 }
382 else {
383 PerlSyntaxKind::BitwiseOr
384 }
385 }
386 '^' => {
387 state.advance(1);
388 PerlSyntaxKind::BitwiseXor
389 }
390 '~' => {
391 state.advance(1);
392 PerlSyntaxKind::BitwiseNot
393 }
394 '.' => {
395 state.advance(1);
396 if let Some('.') = state.peek() {
397 state.advance(1);
398 PerlSyntaxKind::Range
399 }
400 else {
401 PerlSyntaxKind::Concat
402 }
403 }
404 '?' => {
405 state.advance(1);
406 PerlSyntaxKind::Question
407 }
408 ':' => {
409 state.advance(1);
410 PerlSyntaxKind::Colon
411 }
412 ';' => {
413 state.advance(1);
414 PerlSyntaxKind::Semicolon
415 }
416 ',' => {
417 state.advance(1);
418 PerlSyntaxKind::Comma
419 }
420 '(' => {
421 state.advance(1);
422 PerlSyntaxKind::LeftParen
423 }
424 ')' => {
425 state.advance(1);
426 PerlSyntaxKind::RightParen
427 }
428 '[' => {
429 state.advance(1);
430 PerlSyntaxKind::LeftBracket
431 }
432 ']' => {
433 state.advance(1);
434 PerlSyntaxKind::RightBracket
435 }
436 '{' => {
437 state.advance(1);
438 PerlSyntaxKind::LeftBrace
439 }
440 '}' => {
441 state.advance(1);
442 PerlSyntaxKind::RightBrace
443 }
444 '\n' => {
445 state.advance(1);
446 PerlSyntaxKind::Newline
447 }
448 _ => {
449 state.advance(ch.len_utf8());
450 PerlSyntaxKind::Error
451 }
452 };
453
454 state.add_token(kind, start_pos, state.get_position());
455 true
456 }
457 else {
458 false
459 }
460 }
461}
462
463impl<'config> Lexer<PerlLanguage> for PerlLexer<'config> {
464 fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<PerlLanguage>) -> LexOutput<PerlLanguage> {
465 let mut state = LexerState::new(source);
466 let result = self.run(&mut state);
467 if result.is_ok() {
468 state.add_eof();
469 }
470 state.finish_with_cache(result, cache)
471 }
472}
473
474impl<'config> PerlLexer<'config> {
475 fn run<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<(), OakError> {
476 while state.not_at_end() {
477 let safe_point = state.get_position();
478
479 if self.skip_whitespace(state) {
481 continue;
482 }
483
484 if self.skip_comment(state) {
486 continue;
487 }
488
489 if self.lex_string(state) {
491 continue;
492 }
493
494 if self.lex_variable(state) {
496 continue;
497 }
498
499 if self.lex_identifier_or_keyword(state) {
501 continue;
502 }
503
504 if self.lex_number(state) {
506 continue;
507 }
508
509 if self.lex_operators_and_punctuation(state) {
511 continue;
512 }
513
514 let start_pos = state.get_position();
516 if let Some(ch) = state.peek() {
517 state.advance(ch.len_utf8());
518 state.add_token(PerlSyntaxKind::Error, start_pos, state.get_position());
519 }
520
521 state.advance_if_dead_lock(safe_point);
522 }
523
524 Ok(())
525 }
526}