1#![doc = include_str!("readme.md")]
2pub mod token_type;
4
5use crate::{language::PerlLanguage, lexer::token_type::PerlTokenType};
6use oak_core::{
7 Lexer, LexerCache, LexerState, OakError,
8 lexer::{CommentConfig, LexOutput, WhitespaceConfig},
9 source::Source,
10};
11use std::sync::LazyLock;
12
/// Shorthand for the `oak_core` lexer state specialised to [`PerlLanguage`].
type State<'s, S> = LexerState<'s, S, PerlLanguage>;

/// Whitespace scanning configuration for Perl; Unicode whitespace is enabled.
static PERL_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
/// Comment scanning configuration for Perl: `#` is the line-comment marker and
/// both block-comment delimiters are left empty with nesting disabled.
static PERL_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "#", block_start: "", block_end: "", nested_blocks: false });
18
/// Lexer for Perl source text.
///
/// Holds a borrowed [`PerlLanguage`] configuration for the lifetime `'config`.
#[derive(Clone, Debug)]
pub struct PerlLexer<'config> {
    /// Language configuration this lexer was created with.
    pub config: &'config PerlLanguage,
}
27
28impl<'config> PerlLexer<'config> {
    /// Creates a lexer that borrows the given language configuration.
    pub fn new(config: &'config PerlLanguage) -> Self {
        Self { config }
    }
33
    /// Delegates to [`PERL_WHITESPACE`] to scan whitespace at the current
    /// position, tagging it as `PerlTokenType::Whitespace`.
    ///
    /// Returns the scanner's result; `run` treats `true` as "a token was
    /// produced".
    fn skip_whitespace<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
        PERL_WHITESPACE.scan(state, PerlTokenType::Whitespace)
    }
38
    /// Delegates to [`PERL_COMMENT`] to scan a comment at the current position.
    ///
    /// `PerlTokenType::Comment` is passed for both token-kind arguments of the
    /// scanner. Returns the scanner's result; `run` treats `true` as "a token
    /// was produced".
    fn skip_comment<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
        PERL_COMMENT.scan(state, PerlTokenType::Comment, PerlTokenType::Comment)
    }
43
44 fn lex_string<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
46 let start_pos = state.get_position();
47
48 if let Some(quote_char) = state.peek() {
49 if quote_char == '"' || quote_char == '\'' {
50 state.advance(1); let mut escaped = false;
53 while let Some(ch) = state.peek() {
54 if escaped {
55 escaped = false;
56 state.advance(ch.len_utf8())
57 }
58 else if ch == '\\' {
59 escaped = true;
60 state.advance(1)
61 }
62 else if ch == quote_char {
63 state.advance(1); break;
65 }
66 else if ch == '\n' || ch == '\r' {
67 break;
69 }
70 else {
71 state.advance(ch.len_utf8())
72 }
73 }
74
75 state.add_token(PerlTokenType::StringLiteral, start_pos, state.get_position());
76 true
77 }
78 else {
79 false
80 }
81 }
82 else {
83 false
84 }
85 }
86
87 fn lex_variable<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
89 if let Some(ch) = state.peek() {
90 let start_pos = state.get_position();
91
92 match ch {
93 '$' => {
94 state.advance(1);
95 while let Some(ch) = state.peek() {
97 if ch.is_alphanumeric() || ch == '_' { state.advance(ch.len_utf8()) } else { break }
98 }
99 state.add_token(PerlTokenType::Dollar, start_pos, state.get_position());
100 true
101 }
102 '@' => {
103 state.advance(1);
104 while let Some(ch) = state.peek() {
106 if ch.is_alphanumeric() || ch == '_' { state.advance(ch.len_utf8()) } else { break }
107 }
108 state.add_token(PerlTokenType::At, start_pos, state.get_position());
109 true
110 }
111 '%' => {
112 state.advance(1);
113 while let Some(ch) = state.peek() {
115 if ch.is_alphanumeric() || ch == '_' { state.advance(ch.len_utf8()) } else { break }
116 }
117 state.add_token(PerlTokenType::Percent_, start_pos, state.get_position());
118 true
119 }
120 _ => false,
121 }
122 }
123 else {
124 false
125 }
126 }
127
128 fn lex_identifier_or_keyword<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
130 if let Some(ch) = state.peek() {
131 if ch.is_alphabetic() || ch == '_' {
132 let start_pos = state.get_position();
133 let mut text = String::new();
134
135 while let Some(ch) = state.peek() {
137 if ch.is_alphanumeric() || ch == '_' {
138 text.push(ch);
139 state.advance(ch.len_utf8())
140 }
141 else {
142 break;
143 }
144 }
145
146 let kind = match text.as_str() {
148 "if" => PerlTokenType::If,
149 "else" => PerlTokenType::Else,
150 "elsif" => PerlTokenType::Elsif,
151 "unless" => PerlTokenType::Unless,
152 "while" => PerlTokenType::While,
153 "until" => PerlTokenType::Until,
154 "for" => PerlTokenType::For,
155 "foreach" => PerlTokenType::Foreach,
156 "do" => PerlTokenType::Do,
157 "sub" => PerlTokenType::Sub,
158 "package" => PerlTokenType::Package,
159 "use" => PerlTokenType::Use,
160 "require" => PerlTokenType::Require,
161 "my" => PerlTokenType::My,
162 "our" => PerlTokenType::Our,
163 "local" => PerlTokenType::Local,
164 "return" => PerlTokenType::Return,
165 "last" => PerlTokenType::Last,
166 "next" => PerlTokenType::Next,
167 "redo" => PerlTokenType::Redo,
168 "die" => PerlTokenType::Die,
169 "warn" => PerlTokenType::Warn,
170 "eval" => PerlTokenType::Eval,
171 "print" => PerlTokenType::Print,
172 "printf" => PerlTokenType::Printf,
173 "chomp" => PerlTokenType::Chomp,
174 "chop" => PerlTokenType::Chop,
175 "split" => PerlTokenType::Split,
176 "join" => PerlTokenType::Join,
177 "push" => PerlTokenType::Push,
178 "pop" => PerlTokenType::Pop,
179 "shift" => PerlTokenType::Shift,
180 "unshift" => PerlTokenType::Unshift,
181 "keys" => PerlTokenType::Keys,
182 "values" => PerlTokenType::Values,
183 "each" => PerlTokenType::Each,
184 "exists" => PerlTokenType::Exists,
185 "delete" => PerlTokenType::Delete,
186 "defined" => PerlTokenType::Defined,
187 "undef" => PerlTokenType::Undef,
188 "ref" => PerlTokenType::Ref,
189 "bless" => PerlTokenType::Bless,
190 "new" => PerlTokenType::New,
191 "and" => PerlTokenType::And,
192 "or" => PerlTokenType::Or,
193 "not" => PerlTokenType::Not,
194 _ => PerlTokenType::Identifier,
195 };
196
197 state.add_token(kind, start_pos, state.get_position());
198 true
199 }
200 else {
201 false
202 }
203 }
204 else {
205 false
206 }
207 }
208
209 fn lex_number<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
211 if let Some(ch) = state.peek() {
212 if ch.is_ascii_digit() {
213 let start_pos = state.get_position();
214 let mut has_dot = false;
215
216 while let Some(ch) = state.peek() {
218 if ch.is_ascii_digit() {
219 state.advance(1)
220 }
221 else if ch == '.' && !has_dot {
222 has_dot = true;
223 state.advance(1)
224 }
225 else {
226 break;
227 }
228 }
229
230 let kind = PerlTokenType::NumberLiteral;
231
232 state.add_token(kind, start_pos, state.get_position());
233 true
234 }
235 else {
236 false
237 }
238 }
239 else {
240 false
241 }
242 }
243
244 fn lex_operators_and_punctuation<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
246 if let Some(ch) = state.peek() {
247 let start_pos = state.get_position();
248
249 let kind = match ch {
250 '+' => {
251 state.advance(1);
252 if let Some('+') = state.peek() {
253 state.advance(1);
254 PerlTokenType::Increment
255 }
256 else if let Some('=') = state.peek() {
257 state.advance(1);
258 PerlTokenType::PlusAssign
259 }
260 else {
261 PerlTokenType::Plus
262 }
263 }
264 '-' => {
265 state.advance(1);
266 if let Some('-') = state.peek() {
267 state.advance(1);
268 PerlTokenType::Decrement
269 }
270 else if let Some('=') = state.peek() {
271 state.advance(1);
272 PerlTokenType::MinusAssign
273 }
274 else if let Some('>') = state.peek() {
275 state.advance(1);
276 PerlTokenType::Arrow
277 }
278 else {
279 PerlTokenType::Minus
280 }
281 }
282 '*' => {
283 state.advance(1);
284 if let Some('*') = state.peek() {
285 state.advance(1);
286 PerlTokenType::Power
287 }
288 else if let Some('=') = state.peek() {
289 state.advance(1);
290 PerlTokenType::MultiplyAssign
291 }
292 else {
293 PerlTokenType::Star
294 }
295 }
296 '/' => {
297 state.advance(1);
298 if let Some('=') = state.peek() {
299 state.advance(1);
300 PerlTokenType::DivideAssign
301 }
302 else {
303 PerlTokenType::Slash
304 }
305 }
306 '%' => {
307 state.advance(1);
308 if let Some('=') = state.peek() {
309 state.advance(1);
310 PerlTokenType::ModuloAssign
311 }
312 else {
313 PerlTokenType::Percent
314 }
315 }
316 '=' => {
317 state.advance(1);
318 if let Some('=') = state.peek() {
319 state.advance(1);
320 if let Some('>') = state.peek() {
321 state.advance(1);
322 PerlTokenType::FatArrow
323 }
324 else {
325 PerlTokenType::Equal
326 }
327 }
328 else if let Some('~') = state.peek() {
329 state.advance(1);
330 PerlTokenType::Match
331 }
332 else {
333 PerlTokenType::Assign
334 }
335 }
336 '<' => {
337 state.advance(1);
338 if let Some('<') = state.peek() {
339 state.advance(1);
340 PerlTokenType::LeftShift
341 }
342 else if let Some('=') = state.peek() {
343 state.advance(1);
344 if let Some('>') = state.peek() {
345 state.advance(1);
346 PerlTokenType::Spaceship
347 }
348 else {
349 PerlTokenType::LessEqual
350 }
351 }
352 else {
353 PerlTokenType::LessThan
354 }
355 }
356 '>' => {
357 state.advance(1);
358 if let Some('>') = state.peek() {
359 state.advance(1);
360 PerlTokenType::RightShift
361 }
362 else if let Some('=') = state.peek() {
363 state.advance(1);
364 PerlTokenType::GreaterEqual
365 }
366 else {
367 PerlTokenType::GreaterThan
368 }
369 }
370 '!' => {
371 state.advance(1);
372 if let Some('=') = state.peek() {
373 state.advance(1);
374 PerlTokenType::NotEqual
375 }
376 else if let Some('~') = state.peek() {
377 state.advance(1);
378 PerlTokenType::NotMatch
379 }
380 else {
381 PerlTokenType::LogicalNot
382 }
383 }
384 '&' => {
385 state.advance(1);
386 PerlTokenType::BitwiseAnd
387 }
388 '|' => {
389 state.advance(1);
390 PerlTokenType::BitwiseOr
391 }
392 '^' => {
393 state.advance(1);
394 PerlTokenType::BitwiseXor
395 }
396 '~' => {
397 state.advance(1);
398 PerlTokenType::BitwiseNot
399 }
400 '.' => {
401 state.advance(1);
402 if let Some('.') = state.peek() {
403 state.advance(1);
404 PerlTokenType::Range
405 }
406 else {
407 PerlTokenType::Concat
408 }
409 }
410 '?' => {
411 state.advance(1);
412 PerlTokenType::Question
413 }
414 ':' => {
415 state.advance(1);
416 PerlTokenType::Colon
417 }
418 ';' => {
419 state.advance(1);
420 PerlTokenType::Semicolon
421 }
422 ',' => {
423 state.advance(1);
424 PerlTokenType::Comma
425 }
426 '(' => {
427 state.advance(1);
428 PerlTokenType::LeftParen
429 }
430 ')' => {
431 state.advance(1);
432 PerlTokenType::RightParen
433 }
434 '[' => {
435 state.advance(1);
436 PerlTokenType::LeftBracket
437 }
438 ']' => {
439 state.advance(1);
440 PerlTokenType::RightBracket
441 }
442 '{' => {
443 state.advance(1);
444 PerlTokenType::LeftBrace
445 }
446 '}' => {
447 state.advance(1);
448 PerlTokenType::RightBrace
449 }
450 '\n' => {
451 state.advance(1);
452 PerlTokenType::Newline
453 }
454 _ => {
455 state.advance(ch.len_utf8());
456 PerlTokenType::Error
457 }
458 };
459
460 state.add_token(kind, start_pos, state.get_position());
461 true
462 }
463 else {
464 false
465 }
466 }
467}
468
impl<'config> Lexer<PerlLanguage> for PerlLexer<'config> {
    /// Lexes `source` from scratch and hands the result to `cache`.
    ///
    /// A fresh [`LexerState`] is created over `source`, the main loop is run,
    /// and an end-of-file marker is added only when the run succeeded. The
    /// `_edits` argument is not consulted by this implementation.
    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<PerlLanguage>) -> LexOutput<PerlLanguage> {
        let mut state = LexerState::new(source);
        let result = self.run(&mut state);
        // Only append EOF after a successful run.
        if result.is_ok() {
            state.add_eof();
        }
        state.finish_with_cache(result, cache)
    }
}
479
480impl<'config> PerlLexer<'config> {
481 fn run<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<(), OakError> {
483 while state.not_at_end() {
484 let safe_point = state.get_position();
485
486 if self.skip_whitespace(state) {
488 continue;
489 }
490
491 if self.skip_comment(state) {
493 continue;
494 }
495
496 if self.lex_string(state) {
498 continue;
499 }
500
501 if self.lex_variable(state) {
503 continue;
504 }
505
506 if self.lex_identifier_or_keyword(state) {
508 continue;
509 }
510
511 if self.lex_number(state) {
513 continue;
514 }
515
516 if self.lex_operators_and_punctuation(state) {
518 continue;
519 }
520
521 let start_pos = state.get_position();
523 if let Some(ch) = state.peek() {
524 state.advance(ch.len_utf8());
525 state.add_token(PerlTokenType::Error, start_pos, state.get_position())
526 }
527
528 state.advance_if_dead_lock(safe_point)
529 }
530
531 Ok(())
532 }
533}