Skip to main content

oak_perl/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2/// Token type definitions for Perl.
3pub mod token_type;
4
5use crate::{language::PerlLanguage, lexer::token_type::PerlTokenType};
6use oak_core::{
7    Lexer, LexerCache, LexerState, OakError,
8    lexer::{CommentConfig, LexOutput, WhitespaceConfig},
9    source::Source,
10};
11use std::sync::LazyLock;
12
13/// The lexer state type for Perl.
14type State<'s, S> = LexerState<'s, S, PerlLanguage>;
15
16static PERL_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
17static PERL_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "#", block_start: "", block_end: "", nested_blocks: false });
18
19/// Lexer for the Perl language.
20///
21/// This lexer converts a source string into a stream of [`PerlTokenType`] tokens.
22#[derive(Clone, Debug)]
23pub struct PerlLexer<'config> {
24    /// The Perl language configuration.
25    pub config: &'config PerlLanguage,
26}
27
28impl<'config> PerlLexer<'config> {
29    /// Creates a new `PerlLexer` with the given language configuration.
30    pub fn new(config: &'config PerlLanguage) -> Self {
31        Self { config }
32    }
33
34    /// Skips whitespace characters.
35    fn skip_whitespace<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
36        PERL_WHITESPACE.scan(state, PerlTokenType::Whitespace)
37    }
38
39    /// Skips single-line comments.
40    fn skip_comment<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
41        PERL_COMMENT.scan(state, PerlTokenType::Comment, PerlTokenType::Comment)
42    }
43
44    /// Analyzes string literals (single or double quotes).
45    fn lex_string<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
46        let start_pos = state.get_position();
47
48        if let Some(quote_char) = state.peek() {
49            if quote_char == '"' || quote_char == '\'' {
50                state.advance(1); // Skip opening quote
51
52                let mut escaped = false;
53                while let Some(ch) = state.peek() {
54                    if escaped {
55                        escaped = false;
56                        state.advance(ch.len_utf8())
57                    }
58                    else if ch == '\\' {
59                        escaped = true;
60                        state.advance(1)
61                    }
62                    else if ch == quote_char {
63                        state.advance(1); // Skip closing quote
64                        break;
65                    }
66                    else if ch == '\n' || ch == '\r' {
67                        // Strings cannot span lines unless escaped
68                        break;
69                    }
70                    else {
71                        state.advance(ch.len_utf8())
72                    }
73                }
74
75                state.add_token(PerlTokenType::StringLiteral, start_pos, state.get_position());
76                true
77            }
78            else {
79                false
80            }
81        }
82        else {
83            false
84        }
85    }
86
87    /// Analyzes variable names (starting with $, @ or %).
88    fn lex_variable<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
89        if let Some(ch) = state.peek() {
90            let start_pos = state.get_position();
91
92            match ch {
93                '$' => {
94                    state.advance(1);
95                    // Read variable name
96                    while let Some(ch) = state.peek() {
97                        if ch.is_alphanumeric() || ch == '_' { state.advance(ch.len_utf8()) } else { break }
98                    }
99                    state.add_token(PerlTokenType::Dollar, start_pos, state.get_position());
100                    true
101                }
102                '@' => {
103                    state.advance(1);
104                    // Read array variable name
105                    while let Some(ch) = state.peek() {
106                        if ch.is_alphanumeric() || ch == '_' { state.advance(ch.len_utf8()) } else { break }
107                    }
108                    state.add_token(PerlTokenType::At, start_pos, state.get_position());
109                    true
110                }
111                '%' => {
112                    state.advance(1);
113                    // Read hash variable name
114                    while let Some(ch) = state.peek() {
115                        if ch.is_alphanumeric() || ch == '_' { state.advance(ch.len_utf8()) } else { break }
116                    }
117                    state.add_token(PerlTokenType::Percent_, start_pos, state.get_position());
118                    true
119                }
120                _ => false,
121            }
122        }
123        else {
124            false
125        }
126    }
127
128    /// Analyzes identifiers or keywords.
129    fn lex_identifier_or_keyword<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
130        if let Some(ch) = state.peek() {
131            if ch.is_alphabetic() || ch == '_' {
132                let start_pos = state.get_position();
133                let mut text = String::new();
134
135                // Read identifier
136                while let Some(ch) = state.peek() {
137                    if ch.is_alphanumeric() || ch == '_' {
138                        text.push(ch);
139                        state.advance(ch.len_utf8())
140                    }
141                    else {
142                        break;
143                    }
144                }
145
146                // Check if it's a keyword
147                let kind = match text.as_str() {
148                    "if" => PerlTokenType::If,
149                    "else" => PerlTokenType::Else,
150                    "elsif" => PerlTokenType::Elsif,
151                    "unless" => PerlTokenType::Unless,
152                    "while" => PerlTokenType::While,
153                    "until" => PerlTokenType::Until,
154                    "for" => PerlTokenType::For,
155                    "foreach" => PerlTokenType::Foreach,
156                    "do" => PerlTokenType::Do,
157                    "sub" => PerlTokenType::Sub,
158                    "package" => PerlTokenType::Package,
159                    "use" => PerlTokenType::Use,
160                    "require" => PerlTokenType::Require,
161                    "my" => PerlTokenType::My,
162                    "our" => PerlTokenType::Our,
163                    "local" => PerlTokenType::Local,
164                    "return" => PerlTokenType::Return,
165                    "last" => PerlTokenType::Last,
166                    "next" => PerlTokenType::Next,
167                    "redo" => PerlTokenType::Redo,
168                    "die" => PerlTokenType::Die,
169                    "warn" => PerlTokenType::Warn,
170                    "eval" => PerlTokenType::Eval,
171                    "print" => PerlTokenType::Print,
172                    "printf" => PerlTokenType::Printf,
173                    "chomp" => PerlTokenType::Chomp,
174                    "chop" => PerlTokenType::Chop,
175                    "split" => PerlTokenType::Split,
176                    "join" => PerlTokenType::Join,
177                    "push" => PerlTokenType::Push,
178                    "pop" => PerlTokenType::Pop,
179                    "shift" => PerlTokenType::Shift,
180                    "unshift" => PerlTokenType::Unshift,
181                    "keys" => PerlTokenType::Keys,
182                    "values" => PerlTokenType::Values,
183                    "each" => PerlTokenType::Each,
184                    "exists" => PerlTokenType::Exists,
185                    "delete" => PerlTokenType::Delete,
186                    "defined" => PerlTokenType::Defined,
187                    "undef" => PerlTokenType::Undef,
188                    "ref" => PerlTokenType::Ref,
189                    "bless" => PerlTokenType::Bless,
190                    "new" => PerlTokenType::New,
191                    "and" => PerlTokenType::And,
192                    "or" => PerlTokenType::Or,
193                    "not" => PerlTokenType::Not,
194                    _ => PerlTokenType::Identifier,
195                };
196
197                state.add_token(kind, start_pos, state.get_position());
198                true
199            }
200            else {
201                false
202            }
203        }
204        else {
205            false
206        }
207    }
208
209    /// Analyzes number literals.
210    fn lex_number<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
211        if let Some(ch) = state.peek() {
212            if ch.is_ascii_digit() {
213                let start_pos = state.get_position();
214                let mut has_dot = false;
215
216                // Read number
217                while let Some(ch) = state.peek() {
218                    if ch.is_ascii_digit() {
219                        state.advance(1)
220                    }
221                    else if ch == '.' && !has_dot {
222                        has_dot = true;
223                        state.advance(1)
224                    }
225                    else {
226                        break;
227                    }
228                }
229
230                let kind = PerlTokenType::NumberLiteral;
231
232                state.add_token(kind, start_pos, state.get_position());
233                true
234            }
235            else {
236                false
237            }
238        }
239        else {
240            false
241        }
242    }
243
244    /// Analyzes operators and punctuation.
245    fn lex_operators_and_punctuation<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
246        if let Some(ch) = state.peek() {
247            let start_pos = state.get_position();
248
249            let kind = match ch {
250                '+' => {
251                    state.advance(1);
252                    if let Some('+') = state.peek() {
253                        state.advance(1);
254                        PerlTokenType::Increment
255                    }
256                    else if let Some('=') = state.peek() {
257                        state.advance(1);
258                        PerlTokenType::PlusAssign
259                    }
260                    else {
261                        PerlTokenType::Plus
262                    }
263                }
264                '-' => {
265                    state.advance(1);
266                    if let Some('-') = state.peek() {
267                        state.advance(1);
268                        PerlTokenType::Decrement
269                    }
270                    else if let Some('=') = state.peek() {
271                        state.advance(1);
272                        PerlTokenType::MinusAssign
273                    }
274                    else if let Some('>') = state.peek() {
275                        state.advance(1);
276                        PerlTokenType::Arrow
277                    }
278                    else {
279                        PerlTokenType::Minus
280                    }
281                }
282                '*' => {
283                    state.advance(1);
284                    if let Some('*') = state.peek() {
285                        state.advance(1);
286                        PerlTokenType::Power
287                    }
288                    else if let Some('=') = state.peek() {
289                        state.advance(1);
290                        PerlTokenType::MultiplyAssign
291                    }
292                    else {
293                        PerlTokenType::Star
294                    }
295                }
296                '/' => {
297                    state.advance(1);
298                    if let Some('=') = state.peek() {
299                        state.advance(1);
300                        PerlTokenType::DivideAssign
301                    }
302                    else {
303                        PerlTokenType::Slash
304                    }
305                }
306                '%' => {
307                    state.advance(1);
308                    if let Some('=') = state.peek() {
309                        state.advance(1);
310                        PerlTokenType::ModuloAssign
311                    }
312                    else {
313                        PerlTokenType::Percent
314                    }
315                }
316                '=' => {
317                    state.advance(1);
318                    if let Some('=') = state.peek() {
319                        state.advance(1);
320                        if let Some('>') = state.peek() {
321                            state.advance(1);
322                            PerlTokenType::FatArrow
323                        }
324                        else {
325                            PerlTokenType::Equal
326                        }
327                    }
328                    else if let Some('~') = state.peek() {
329                        state.advance(1);
330                        PerlTokenType::Match
331                    }
332                    else {
333                        PerlTokenType::Assign
334                    }
335                }
336                '<' => {
337                    state.advance(1);
338                    if let Some('<') = state.peek() {
339                        state.advance(1);
340                        PerlTokenType::LeftShift
341                    }
342                    else if let Some('=') = state.peek() {
343                        state.advance(1);
344                        if let Some('>') = state.peek() {
345                            state.advance(1);
346                            PerlTokenType::Spaceship
347                        }
348                        else {
349                            PerlTokenType::LessEqual
350                        }
351                    }
352                    else {
353                        PerlTokenType::LessThan
354                    }
355                }
356                '>' => {
357                    state.advance(1);
358                    if let Some('>') = state.peek() {
359                        state.advance(1);
360                        PerlTokenType::RightShift
361                    }
362                    else if let Some('=') = state.peek() {
363                        state.advance(1);
364                        PerlTokenType::GreaterEqual
365                    }
366                    else {
367                        PerlTokenType::GreaterThan
368                    }
369                }
370                '!' => {
371                    state.advance(1);
372                    if let Some('=') = state.peek() {
373                        state.advance(1);
374                        PerlTokenType::NotEqual
375                    }
376                    else if let Some('~') = state.peek() {
377                        state.advance(1);
378                        PerlTokenType::NotMatch
379                    }
380                    else {
381                        PerlTokenType::LogicalNot
382                    }
383                }
384                '&' => {
385                    state.advance(1);
386                    PerlTokenType::BitwiseAnd
387                }
388                '|' => {
389                    state.advance(1);
390                    PerlTokenType::BitwiseOr
391                }
392                '^' => {
393                    state.advance(1);
394                    PerlTokenType::BitwiseXor
395                }
396                '~' => {
397                    state.advance(1);
398                    PerlTokenType::BitwiseNot
399                }
400                '.' => {
401                    state.advance(1);
402                    if let Some('.') = state.peek() {
403                        state.advance(1);
404                        PerlTokenType::Range
405                    }
406                    else {
407                        PerlTokenType::Concat
408                    }
409                }
410                '?' => {
411                    state.advance(1);
412                    PerlTokenType::Question
413                }
414                ':' => {
415                    state.advance(1);
416                    PerlTokenType::Colon
417                }
418                ';' => {
419                    state.advance(1);
420                    PerlTokenType::Semicolon
421                }
422                ',' => {
423                    state.advance(1);
424                    PerlTokenType::Comma
425                }
426                '(' => {
427                    state.advance(1);
428                    PerlTokenType::LeftParen
429                }
430                ')' => {
431                    state.advance(1);
432                    PerlTokenType::RightParen
433                }
434                '[' => {
435                    state.advance(1);
436                    PerlTokenType::LeftBracket
437                }
438                ']' => {
439                    state.advance(1);
440                    PerlTokenType::RightBracket
441                }
442                '{' => {
443                    state.advance(1);
444                    PerlTokenType::LeftBrace
445                }
446                '}' => {
447                    state.advance(1);
448                    PerlTokenType::RightBrace
449                }
450                '\n' => {
451                    state.advance(1);
452                    PerlTokenType::Newline
453                }
454                _ => {
455                    state.advance(ch.len_utf8());
456                    PerlTokenType::Error
457                }
458            };
459
460            state.add_token(kind, start_pos, state.get_position());
461            true
462        }
463        else {
464            false
465        }
466    }
467}
468
469impl<'config> Lexer<PerlLanguage> for PerlLexer<'config> {
470    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<PerlLanguage>) -> LexOutput<PerlLanguage> {
471        let mut state = LexerState::new(source);
472        let result = self.run(&mut state);
473        if result.is_ok() {
474            state.add_eof();
475        }
476        state.finish_with_cache(result, cache)
477    }
478}
479
480impl<'config> PerlLexer<'config> {
481    /// Runs the lexer on the given state until the end of the source.
482    fn run<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<(), OakError> {
483        while state.not_at_end() {
484            let safe_point = state.get_position();
485
486            // Skip whitespace
487            if self.skip_whitespace(state) {
488                continue;
489            }
490
491            // Handle comments
492            if self.skip_comment(state) {
493                continue;
494            }
495
496            // Handle strings
497            if self.lex_string(state) {
498                continue;
499            }
500
501            // Handle variables
502            if self.lex_variable(state) {
503                continue;
504            }
505
506            // Handle identifiers and keywords
507            if self.lex_identifier_or_keyword(state) {
508                continue;
509            }
510
511            // Handle numbers
512            if self.lex_number(state) {
513                continue;
514            }
515
516            // Handle operators and punctuation
517            if self.lex_operators_and_punctuation(state) {
518                continue;
519            }
520
521            // If no pattern matches, create an error token
522            let start_pos = state.get_position();
523            if let Some(ch) = state.peek() {
524                state.advance(ch.len_utf8());
525                state.add_token(PerlTokenType::Error, start_pos, state.get_position())
526            }
527
528            state.advance_if_dead_lock(safe_point)
529        }
530
531        Ok(())
532    }
533}