Skip to main content

oak_php/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2/// Token type definitions for the PHP lexer.
3pub mod token_type;
4use crate::language::PhpLanguage;
5use oak_core::{Lexer, LexerCache, LexerState, OakError, lexer::LexOutput, source::Source};
6pub use token_type::{PhpToken, PhpTokenType};
7
8type State<'s, S> = LexerState<'s, S, PhpLanguage>;
9
10/// Lexer for the PHP language.
11///
12/// This lexer transforms a source string into a stream of [`PhpTokenType`] tokens.
13#[derive(Clone, Debug)]
14pub struct PhpLexer<'config> {
15    config: &'config PhpLanguage,
16}
17
18impl<'config> Lexer<PhpLanguage> for PhpLexer<'config> {
19    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<PhpLanguage>) -> LexOutput<PhpLanguage> {
20        let mut state = State::new_with_cache(source, 0, cache);
21        let result = self.run(&mut state);
22        if result.is_ok() {
23            state.add_eof();
24        }
25        state.finish_with_cache(result, cache)
26    }
27}
28
29impl<'config> PhpLexer<'config> {
30    /// Creates a new `PhpLexer` with the given language configuration.
31    pub fn new(config: &'config PhpLanguage) -> Self {
32        Self { config }
33    }
34
35    fn run<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<(), OakError> {
36        while state.not_at_end() {
37            if self.skip_whitespace(state) {
38                continue;
39            }
40
41            if self.lex_newline(state) {
42                continue;
43            }
44
45            if self.lex_tags(state) {
46                continue;
47            }
48
49            if self.lex_comment(state) {
50                continue;
51            }
52
53            if self.lex_string(state) {
54                continue;
55            }
56
57            if self.lex_number(state) {
58                continue;
59            }
60
61            if self.lex_identifier_or_keyword(state) {
62                continue;
63            }
64
65            if self.lex_operators_and_punctuation(state) {
66                continue;
67            }
68
69            // If no rules match, skip the current character
70            if let Some(ch) = state.peek() {
71                let start_pos = state.get_position();
72                state.advance(ch.len_utf8());
73                state.add_token(PhpTokenType::Error, start_pos, state.get_position())
74            }
75            else {
76                // Exit the loop if the end of file is reached
77                break;
78            }
79        }
80
81        Ok(())
82    }
83
84    fn skip_whitespace<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
85        let start_pos = state.get_position();
86
87        while let Some(ch) = state.peek() {
88            if ch == ' ' || ch == '\t' {
89                state.advance(ch.len_utf8())
90            }
91            else {
92                break;
93            }
94        }
95
96        if state.get_position() > start_pos {
97            state.add_token(PhpTokenType::Whitespace, start_pos, state.get_position());
98            true
99        }
100        else {
101            false
102        }
103    }
104
105    fn lex_tags<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
106        let start_pos = state.get_position();
107        let rest = state.rest();
108
109        if rest.starts_with(&self.config.tag_start) {
110            state.advance(self.config.tag_start.len());
111            state.add_token(PhpTokenType::OpenTag, start_pos, state.get_position());
112            return true;
113        }
114
115        if rest.starts_with(&self.config.echo_tag_start) {
116            state.advance(self.config.echo_tag_start.len());
117            state.add_token(PhpTokenType::EchoTag, start_pos, state.get_position());
118            return true;
119        }
120
121        if rest.starts_with(&self.config.short_tag_start) {
122            state.advance(self.config.short_tag_start.len());
123            state.add_token(PhpTokenType::OpenTag, start_pos, state.get_position());
124            return true;
125        }
126
127        if rest.starts_with(&self.config.tag_end) {
128            state.advance(self.config.tag_end.len());
129            state.add_token(PhpTokenType::CloseTag, start_pos, state.get_position());
130            return true;
131        }
132
133        false
134    }
135
136    fn lex_newline<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
137        let start_pos = state.get_position();
138
139        if let Some('\n') = state.peek() {
140            state.advance(1);
141            state.add_token(PhpTokenType::Newline, start_pos, state.get_position());
142            true
143        }
144        else if let Some('\r') = state.peek() {
145            state.advance(1);
146            if let Some('\n') = state.peek() {
147                state.advance(1)
148            }
149            state.add_token(PhpTokenType::Newline, start_pos, state.get_position());
150            true
151        }
152        else {
153            false
154        }
155    }
156
157    fn lex_comment<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
158        let start_pos = state.get_position();
159
160        if let Some('/') = state.peek() {
161            state.advance(1);
162            if let Some('/') = state.peek() {
163                state.advance(1);
164                // Single-line comment
165                while let Some(ch) = state.peek() {
166                    if ch == '\n' || ch == '\r' {
167                        break;
168                    }
169                    state.advance(ch.len_utf8())
170                }
171                state.add_token(PhpTokenType::Comment, start_pos, state.get_position());
172                return true;
173            }
174            else if let Some('*') = state.peek() {
175                state.advance(1);
176                // Multi-line comment
177                while let Some(ch) = state.peek() {
178                    if ch == '*' {
179                        state.advance(1);
180                        if let Some('/') = state.peek() {
181                            state.advance(1);
182                            break;
183                        }
184                    }
185                    else {
186                        state.advance(ch.len_utf8())
187                    }
188                }
189                state.add_token(PhpTokenType::Comment, start_pos, state.get_position());
190                return true;
191            }
192            else {
193                // Backtrack, this is not a comment
194                state.set_position(start_pos);
195                return false;
196            }
197        }
198        else if let Some('#') = state.peek() {
199            state.advance(1);
200            // PHP-style single-line comment
201            while let Some(ch) = state.peek() {
202                if ch == '\n' || ch == '\r' {
203                    break;
204                }
205                state.advance(ch.len_utf8())
206            }
207            state.add_token(PhpTokenType::Comment, start_pos, state.get_position());
208            true
209        }
210        else {
211            false
212        }
213    }
214
215    fn lex_string<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
216        let start_pos = state.get_position();
217
218        if let Some(quote_char) = state.peek() {
219            if quote_char == '"' || quote_char == '\'' {
220                state.advance(1); // Skip starting quote
221
222                let mut escaped = false;
223                while let Some(ch) = state.peek() {
224                    if escaped {
225                        escaped = false;
226                        state.advance(ch.len_utf8())
227                    }
228                    else if ch == '\\' {
229                        escaped = true;
230                        state.advance(1)
231                    }
232                    else if ch == quote_char {
233                        state.advance(1); // Skip ending quote
234                        break;
235                    }
236                    else if ch == '\n' || ch == '\r' {
237                        // Strings cannot span multiple lines (unless escaped)
238                        break;
239                    }
240                    else {
241                        state.advance(ch.len_utf8())
242                    }
243                }
244
245                state.add_token(PhpTokenType::StringLiteral, start_pos, state.get_position());
246                true
247            }
248            else {
249                false
250            }
251        }
252        else {
253            false
254        }
255    }
256
257    fn lex_number<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
258        if let Some(ch) = state.peek() {
259            if ch.is_ascii_digit() {
260                let start_pos = state.get_position();
261
262                // Read integer part
263                while let Some(ch) = state.peek() {
264                    if ch.is_ascii_digit() {
265                        state.advance(1)
266                    }
267                    else {
268                        break;
269                    }
270                }
271
272                // Check for decimal point
273                if let Some('.') = state.peek() {
274                    state.advance(1);
275                    // Read fractional part
276                    while let Some(ch) = state.peek() {
277                        if ch.is_ascii_digit() {
278                            state.advance(1)
279                        }
280                        else {
281                            break;
282                        }
283                    }
284                }
285
286                // Check for scientific notation
287                if let Some(ch) = state.peek() {
288                    if ch == 'e' || ch == 'E' {
289                        state.advance(1);
290                        if let Some(ch) = state.peek() {
291                            if ch == '+' || ch == '-' {
292                                state.advance(1)
293                            }
294                        }
295                        while let Some(ch) = state.peek() {
296                            if ch.is_ascii_digit() {
297                                state.advance(1)
298                            }
299                            else {
300                                break;
301                            }
302                        }
303                    }
304                }
305
306                state.add_token(PhpTokenType::NumberLiteral, start_pos, state.get_position());
307                true
308            }
309            else {
310                false
311            }
312        }
313        else {
314            false
315        }
316    }
317
318    fn lex_identifier_or_keyword<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
319        if let Some(ch) = state.peek() {
320            if ch.is_alphabetic() || ch == '_' || ch == '$' {
321                let start_pos = state.get_position();
322
323                // Read identifier
324                while let Some(ch) = state.peek() {
325                    if ch.is_alphanumeric() || ch == '_' || ch == '$' {
326                        state.advance(ch.len_utf8())
327                    }
328                    else {
329                        break;
330                    }
331                }
332
333                let end_pos = state.get_position();
334                let text = state.source().get_text_in(oak_core::Range { start: start_pos, end: end_pos });
335
336                // Check if it is a keyword
337                let kind = match text.as_ref() {
338                    "abstract" => PhpTokenType::Abstract,
339                    "and" => PhpTokenType::And,
340                    "array" => PhpTokenType::Array,
341                    "as" => PhpTokenType::As,
342                    "break" => PhpTokenType::Break,
343                    "callable" => PhpTokenType::Callable,
344                    "case" => PhpTokenType::Case,
345                    "catch" => PhpTokenType::Catch,
346                    "class" => PhpTokenType::Class,
347                    "clone" => PhpTokenType::Clone,
348                    "const" => PhpTokenType::Const,
349                    "continue" => PhpTokenType::Continue,
350                    "declare" => PhpTokenType::Declare,
351                    "default" => PhpTokenType::Default,
352                    "die" => PhpTokenType::Exit,
353                    "do" => PhpTokenType::Do,
354                    "echo" => PhpTokenType::Echo,
355                    "else" => PhpTokenType::Else,
356                    "elseif" => PhpTokenType::Elseif,
357                    "empty" => PhpTokenType::Empty,
358                    "enddeclare" => PhpTokenType::Enddeclare,
359                    "endfor" => PhpTokenType::Endfor,
360                    "endforeach" => PhpTokenType::Endforeach,
361                    "endif" => PhpTokenType::Endif,
362                    "endswitch" => PhpTokenType::Endswitch,
363                    "endwhile" => PhpTokenType::Endwhile,
364                    "eval" => PhpTokenType::Eval,
365                    "exit" => PhpTokenType::Exit,
366                    "extends" => PhpTokenType::Extends,
367                    "final" => PhpTokenType::Final,
368                    "finally" => PhpTokenType::Finally,
369                    "for" => PhpTokenType::For,
370                    "foreach" => PhpTokenType::Foreach,
371                    "function" => PhpTokenType::Function,
372                    "global" => PhpTokenType::Global,
373                    "goto" => PhpTokenType::Goto,
374                    "if" => PhpTokenType::If,
375                    "implements" => PhpTokenType::Implements,
376                    "include" => PhpTokenType::Include,
377                    "include_once" => PhpTokenType::IncludeOnce,
378                    "instanceof" => PhpTokenType::Instanceof,
379                    "insteadof" => PhpTokenType::Insteadof,
380                    "interface" => PhpTokenType::Interface,
381                    "isset" => PhpTokenType::Isset,
382                    "list" => PhpTokenType::List,
383                    "namespace" => PhpTokenType::Namespace,
384                    "new" => PhpTokenType::New,
385                    "or" => PhpTokenType::Or,
386                    "print" => PhpTokenType::Print,
387                    "private" => PhpTokenType::Private,
388                    "protected" => PhpTokenType::Protected,
389                    "public" => PhpTokenType::Public,
390                    "require" => PhpTokenType::Require,
391                    "require_once" => PhpTokenType::RequireOnce,
392                    "return" => PhpTokenType::Return,
393                    "static" => PhpTokenType::Static,
394                    "switch" => PhpTokenType::Switch,
395                    "throw" => PhpTokenType::Throw,
396                    "trait" => PhpTokenType::Trait,
397                    "try" => PhpTokenType::Try,
398                    "unset" => PhpTokenType::Unset,
399                    "use" => PhpTokenType::Use,
400                    "var" => PhpTokenType::Var,
401                    "while" => PhpTokenType::While,
402                    "xor" => PhpTokenType::Xor,
403                    "yield" => PhpTokenType::Yield,
404                    "true" => PhpTokenType::BooleanLiteral,
405                    "false" => PhpTokenType::BooleanLiteral,
406                    "null" => PhpTokenType::NullLiteral,
407                    _ => PhpTokenType::Identifier,
408                };
409
410                state.add_token(kind, start_pos, state.get_position());
411                true
412            }
413            else {
414                false
415            }
416        }
417        else {
418            false
419        }
420    }
421
422    fn lex_operators_and_punctuation<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
423        if let Some(ch) = state.peek() {
424            let start_pos = state.get_position();
425
426            let kind = match ch {
427                '+' => {
428                    state.advance(1);
429                    if let Some('+') = state.peek() {
430                        state.advance(1);
431                        PhpTokenType::Increment
432                    }
433                    else if let Some('=') = state.peek() {
434                        state.advance(1);
435                        PhpTokenType::PlusAssign
436                    }
437                    else {
438                        PhpTokenType::Plus
439                    }
440                }
441                '-' => {
442                    state.advance(1);
443                    if let Some('-') = state.peek() {
444                        state.advance(1);
445                        PhpTokenType::Decrement
446                    }
447                    else if let Some('=') = state.peek() {
448                        state.advance(1);
449                        PhpTokenType::MinusAssign
450                    }
451                    else if let Some('>') = state.peek() {
452                        state.advance(1);
453                        PhpTokenType::Arrow
454                    }
455                    else {
456                        PhpTokenType::Minus
457                    }
458                }
459                '*' => {
460                    state.advance(1);
461                    if let Some('*') = state.peek() {
462                        state.advance(1);
463                        PhpTokenType::Power
464                    }
465                    else if let Some('=') = state.peek() {
466                        state.advance(1);
467                        PhpTokenType::MultiplyAssign
468                    }
469                    else {
470                        PhpTokenType::Multiply
471                    }
472                }
473                '/' => {
474                    state.advance(1);
475                    if let Some('=') = state.peek() {
476                        state.advance(1);
477                        PhpTokenType::DivideAssign
478                    }
479                    else {
480                        PhpTokenType::Divide
481                    }
482                }
483                '%' => {
484                    state.advance(1);
485                    if let Some('=') = state.peek() {
486                        state.advance(1);
487                        PhpTokenType::ModuloAssign
488                    }
489                    else {
490                        PhpTokenType::Modulo
491                    }
492                }
493                '=' => {
494                    state.advance(1);
495                    if let Some('=') = state.peek() {
496                        state.advance(1);
497                        if let Some('=') = state.peek() {
498                            state.advance(1);
499                            PhpTokenType::Identical
500                        }
501                        else {
502                            PhpTokenType::Equal
503                        }
504                    }
505                    else if let Some('>') = state.peek() {
506                        state.advance(1);
507                        PhpTokenType::DoubleArrow
508                    }
509                    else {
510                        PhpTokenType::Assign
511                    }
512                }
513                '!' => {
514                    state.advance(1);
515                    if let Some('=') = state.peek() {
516                        state.advance(1);
517                        if let Some('=') = state.peek() {
518                            state.advance(1);
519                            PhpTokenType::NotIdentical
520                        }
521                        else {
522                            PhpTokenType::NotEqual
523                        }
524                    }
525                    else {
526                        PhpTokenType::LogicalNot
527                    }
528                }
529                '<' => {
530                    state.advance(1);
531                    if let Some('=') = state.peek() {
532                        state.advance(1);
533                        PhpTokenType::LessEqual
534                    }
535                    else if let Some('<') = state.peek() {
536                        state.advance(1);
537                        if let Some('=') = state.peek() {
538                            state.advance(1);
539                            PhpTokenType::LeftShiftAssign
540                        }
541                        else {
542                            PhpTokenType::LeftShift
543                        }
544                    }
545                    else if let Some('>') = state.peek() {
546                        state.advance(1);
547                        PhpTokenType::Spaceship
548                    }
549                    else {
550                        PhpTokenType::Less
551                    }
552                }
553                '>' => {
554                    state.advance(1);
555                    if let Some('=') = state.peek() {
556                        state.advance(1);
557                        PhpTokenType::GreaterEqual
558                    }
559                    else if let Some('>') = state.peek() {
560                        state.advance(1);
561                        if let Some('=') = state.peek() {
562                            state.advance(1);
563                            PhpTokenType::RightShiftAssign
564                        }
565                        else {
566                            PhpTokenType::RightShift
567                        }
568                    }
569                    else {
570                        PhpTokenType::Greater
571                    }
572                }
573                '&' => {
574                    state.advance(1);
575                    if let Some('&') = state.peek() {
576                        state.advance(1);
577                        PhpTokenType::LogicalAnd
578                    }
579                    else if let Some('=') = state.peek() {
580                        state.advance(1);
581                        PhpTokenType::BitwiseAndAssign
582                    }
583                    else {
584                        PhpTokenType::BitwiseAnd
585                    }
586                }
587                '|' => {
588                    state.advance(1);
589                    if let Some('|') = state.peek() {
590                        state.advance(1);
591                        PhpTokenType::LogicalOr
592                    }
593                    else if let Some('=') = state.peek() {
594                        state.advance(1);
595                        PhpTokenType::BitwiseOrAssign
596                    }
597                    else {
598                        PhpTokenType::BitwiseOr
599                    }
600                }
601                '^' => {
602                    state.advance(1);
603                    if let Some('=') = state.peek() {
604                        state.advance(1);
605                        PhpTokenType::BitwiseXorAssign
606                    }
607                    else {
608                        PhpTokenType::BitwiseXor
609                    }
610                }
611                '~' => {
612                    state.advance(1);
613                    PhpTokenType::BitwiseNot
614                }
615                '?' => {
616                    state.advance(1);
617                    if let Some('?') = state.peek() {
618                        state.advance(1);
619                        PhpTokenType::NullCoalesce
620                    }
621                    else {
622                        PhpTokenType::Question
623                    }
624                }
625                ':' => {
626                    state.advance(1);
627                    if let Some(':') = state.peek() {
628                        state.advance(1);
629                        PhpTokenType::DoubleColon
630                    }
631                    else {
632                        PhpTokenType::Colon
633                    }
634                }
635                ';' => {
636                    state.advance(1);
637                    PhpTokenType::Semicolon
638                }
639                ',' => {
640                    state.advance(1);
641                    PhpTokenType::Comma
642                }
643                '.' => {
644                    state.advance(1);
645                    if let Some('=') = state.peek() {
646                        state.advance(1);
647                        PhpTokenType::ConcatAssign
648                    }
649                    else {
650                        PhpTokenType::Dot
651                    }
652                }
653                '(' => {
654                    state.advance(1);
655                    PhpTokenType::LeftParen
656                }
657                ')' => {
658                    state.advance(1);
659                    PhpTokenType::RightParen
660                }
661                '[' => {
662                    state.advance(1);
663                    PhpTokenType::LeftBracket
664                }
665                ']' => {
666                    state.advance(1);
667                    PhpTokenType::RightBracket
668                }
669                '{' => {
670                    state.advance(1);
671                    PhpTokenType::LeftBrace
672                }
673                '}' => {
674                    state.advance(1);
675                    PhpTokenType::RightBrace
676                }
677                '$' => {
678                    state.advance(1);
679                    PhpTokenType::Dollar
680                }
681                '@' => {
682                    state.advance(1);
683                    PhpTokenType::At
684                }
685                _ => return false,
686            };
687
688            state.add_token(kind, start_pos, state.get_position());
689            true
690        }
691        else {
692            false
693        }
694    }
695}