Skip to main content

oak_php/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2pub mod token_type;
3use crate::language::PhpLanguage;
4use oak_core::{Lexer, LexerCache, LexerState, OakError, lexer::LexOutput, source::Source};
5pub use token_type::{PhpToken, PhpTokenType};
6
7type State<'s, S> = LexerState<'s, S, PhpLanguage>;
8
9/// Lexer for the PHP language.
10///
11/// This lexer transforms a source string into a stream of [`PhpTokenType`] tokens.
12#[derive(Clone, Debug)]
13pub struct PhpLexer<'config> {
14    _config: &'config PhpLanguage,
15}
16
17impl<'config> Lexer<PhpLanguage> for PhpLexer<'config> {
18    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<PhpLanguage>) -> LexOutput<PhpLanguage> {
19        let mut state = State::new_with_cache(source, 0, cache);
20        let result = self.run(&mut state);
21        if result.is_ok() {
22            state.add_eof();
23        }
24        state.finish_with_cache(result, cache)
25    }
26}
27
28impl<'config> PhpLexer<'config> {
29    /// Creates a new `PhpLexer` with the given language configuration.
30    pub fn new(config: &'config PhpLanguage) -> Self {
31        Self { _config: config }
32    }
33
34    fn run<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<(), OakError> {
35        while state.not_at_end() {
36            if self.skip_whitespace(state) {
37                continue;
38            }
39
40            if self.lex_newline(state) {
41                continue;
42            }
43
44            if self.lex_comment(state) {
45                continue;
46            }
47
48            if self.lex_string(state) {
49                continue;
50            }
51
52            if self.lex_number(state) {
53                continue;
54            }
55
56            if self.lex_identifier_or_keyword(state) {
57                continue;
58            }
59
60            if self.lex_operators_and_punctuation(state) {
61                continue;
62            }
63
64            // 如果没有匹配任何规则,跳过当前字符
65            if let Some(ch) = state.peek() {
66                let start_pos = state.get_position();
67                state.advance(ch.len_utf8());
68                state.add_token(PhpTokenType::Error, start_pos, state.get_position())
69            }
70            else {
71                // 如果已到达文件末尾,退出循环
72                break;
73            }
74        }
75
76        Ok(())
77    }
78
79    fn skip_whitespace<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
80        let start_pos = state.get_position();
81
82        while let Some(ch) = state.peek() {
83            if ch == ' ' || ch == '\t' {
84                state.advance(ch.len_utf8())
85            }
86            else {
87                break;
88            }
89        }
90
91        if state.get_position() > start_pos {
92            state.add_token(PhpTokenType::Whitespace, start_pos, state.get_position());
93            true
94        }
95        else {
96            false
97        }
98    }
99
100    fn lex_newline<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
101        let start_pos = state.get_position();
102
103        if let Some('\n') = state.peek() {
104            state.advance(1);
105            state.add_token(PhpTokenType::Newline, start_pos, state.get_position());
106            true
107        }
108        else if let Some('\r') = state.peek() {
109            state.advance(1);
110            if let Some('\n') = state.peek() {
111                state.advance(1)
112            }
113            state.add_token(PhpTokenType::Newline, start_pos, state.get_position());
114            true
115        }
116        else {
117            false
118        }
119    }
120
121    fn lex_comment<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
122        let start_pos = state.get_position();
123
124        if let Some('/') = state.peek() {
125            state.advance(1);
126            if let Some('/') = state.peek() {
127                state.advance(1);
128                // 单行注释
129                while let Some(ch) = state.peek() {
130                    if ch == '\n' || ch == '\r' {
131                        break;
132                    }
133                    state.advance(ch.len_utf8())
134                }
135                state.add_token(PhpTokenType::Comment, start_pos, state.get_position());
136                return true;
137            }
138            else if let Some('*') = state.peek() {
139                state.advance(1);
140                // 多行注释
141                while let Some(ch) = state.peek() {
142                    if ch == '*' {
143                        state.advance(1);
144                        if let Some('/') = state.peek() {
145                            state.advance(1);
146                            break;
147                        }
148                    }
149                    else {
150                        state.advance(ch.len_utf8())
151                    }
152                }
153                state.add_token(PhpTokenType::Comment, start_pos, state.get_position());
154                return true;
155            }
156            else {
157                // 回退,这不是注释
158                state.set_position(start_pos);
159                return false;
160            }
161        }
162        else if let Some('#') = state.peek() {
163            state.advance(1);
164            // PHP 风格的单行注释
165            while let Some(ch) = state.peek() {
166                if ch == '\n' || ch == '\r' {
167                    break;
168                }
169                state.advance(ch.len_utf8())
170            }
171            state.add_token(PhpTokenType::Comment, start_pos, state.get_position());
172            true
173        }
174        else {
175            false
176        }
177    }
178
179    fn lex_string<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
180        let start_pos = state.get_position();
181
182        if let Some(quote_char) = state.peek() {
183            if quote_char == '"' || quote_char == '\'' {
184                state.advance(1); // 跳过开始引号
185
186                let mut escaped = false;
187                while let Some(ch) = state.peek() {
188                    if escaped {
189                        escaped = false;
190                        state.advance(ch.len_utf8())
191                    }
192                    else if ch == '\\' {
193                        escaped = true;
194                        state.advance(1)
195                    }
196                    else if ch == quote_char {
197                        state.advance(1); // 跳过结束引号
198                        break;
199                    }
200                    else if ch == '\n' || ch == '\r' {
201                        // 字符串不能跨行(除非转义)
202                        break;
203                    }
204                    else {
205                        state.advance(ch.len_utf8())
206                    }
207                }
208
209                state.add_token(PhpTokenType::StringLiteral, start_pos, state.get_position());
210                true
211            }
212            else {
213                false
214            }
215        }
216        else {
217            false
218        }
219    }
220
221    fn lex_number<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
222        if let Some(ch) = state.peek() {
223            if ch.is_ascii_digit() {
224                let start_pos = state.get_position();
225
226                // 读取整数部分
227                while let Some(ch) = state.peek() {
228                    if ch.is_ascii_digit() {
229                        state.advance(1)
230                    }
231                    else {
232                        break;
233                    }
234                }
235
236                // 检查小数点
237                if let Some('.') = state.peek() {
238                    state.advance(1);
239                    // 读取小数部分
240                    while let Some(ch) = state.peek() {
241                        if ch.is_ascii_digit() {
242                            state.advance(1)
243                        }
244                        else {
245                            break;
246                        }
247                    }
248                }
249
250                // 检查科学记数法
251                if let Some(ch) = state.peek() {
252                    if ch == 'e' || ch == 'E' {
253                        state.advance(1);
254                        if let Some(ch) = state.peek() {
255                            if ch == '+' || ch == '-' {
256                                state.advance(1)
257                            }
258                        }
259                        while let Some(ch) = state.peek() {
260                            if ch.is_ascii_digit() {
261                                state.advance(1)
262                            }
263                            else {
264                                break;
265                            }
266                        }
267                    }
268                }
269
270                state.add_token(PhpTokenType::NumberLiteral, start_pos, state.get_position());
271                true
272            }
273            else {
274                false
275            }
276        }
277        else {
278            false
279        }
280    }
281
282    fn lex_identifier_or_keyword<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
283        if let Some(ch) = state.peek() {
284            if ch.is_alphabetic() || ch == '_' || ch == '$' {
285                let start_pos = state.get_position();
286
287                // 读取标识符
288                while let Some(ch) = state.peek() {
289                    if ch.is_alphanumeric() || ch == '_' || ch == '$' {
290                        state.advance(ch.len_utf8())
291                    }
292                    else {
293                        break;
294                    }
295                }
296
297                let end_pos = state.get_position();
298                let text = state.source().get_text_in(oak_core::Range { start: start_pos, end: end_pos });
299
300                // 检查是否是关键字
301                let kind = match text.as_ref() {
302                    "abstract" => PhpTokenType::Abstract,
303                    "and" => PhpTokenType::And,
304                    "array" => PhpTokenType::Array,
305                    "as" => PhpTokenType::As,
306                    "break" => PhpTokenType::Break,
307                    "callable" => PhpTokenType::Callable,
308                    "case" => PhpTokenType::Case,
309                    "catch" => PhpTokenType::Catch,
310                    "class" => PhpTokenType::Class,
311                    "clone" => PhpTokenType::Clone,
312                    "const" => PhpTokenType::Const,
313                    "continue" => PhpTokenType::Continue,
314                    "declare" => PhpTokenType::Declare,
315                    "default" => PhpTokenType::Default,
316                    "die" => PhpTokenType::Exit,
317                    "do" => PhpTokenType::Do,
318                    "echo" => PhpTokenType::Echo,
319                    "else" => PhpTokenType::Else,
320                    "elseif" => PhpTokenType::Elseif,
321                    "empty" => PhpTokenType::Empty,
322                    "enddeclare" => PhpTokenType::Enddeclare,
323                    "endfor" => PhpTokenType::Endfor,
324                    "endforeach" => PhpTokenType::Endforeach,
325                    "endif" => PhpTokenType::Endif,
326                    "endswitch" => PhpTokenType::Endswitch,
327                    "endwhile" => PhpTokenType::Endwhile,
328                    "eval" => PhpTokenType::Eval,
329                    "exit" => PhpTokenType::Exit,
330                    "extends" => PhpTokenType::Extends,
331                    "final" => PhpTokenType::Final,
332                    "finally" => PhpTokenType::Finally,
333                    "for" => PhpTokenType::For,
334                    "foreach" => PhpTokenType::Foreach,
335                    "function" => PhpTokenType::Function,
336                    "global" => PhpTokenType::Global,
337                    "goto" => PhpTokenType::Goto,
338                    "if" => PhpTokenType::If,
339                    "implements" => PhpTokenType::Implements,
340                    "include" => PhpTokenType::Include,
341                    "include_once" => PhpTokenType::IncludeOnce,
342                    "instanceof" => PhpTokenType::Instanceof,
343                    "insteadof" => PhpTokenType::Insteadof,
344                    "interface" => PhpTokenType::Interface,
345                    "isset" => PhpTokenType::Isset,
346                    "list" => PhpTokenType::List,
347                    "namespace" => PhpTokenType::Namespace,
348                    "new" => PhpTokenType::New,
349                    "or" => PhpTokenType::Or,
350                    "print" => PhpTokenType::Print,
351                    "private" => PhpTokenType::Private,
352                    "protected" => PhpTokenType::Protected,
353                    "public" => PhpTokenType::Public,
354                    "require" => PhpTokenType::Require,
355                    "require_once" => PhpTokenType::RequireOnce,
356                    "return" => PhpTokenType::Return,
357                    "static" => PhpTokenType::Static,
358                    "switch" => PhpTokenType::Switch,
359                    "throw" => PhpTokenType::Throw,
360                    "trait" => PhpTokenType::Trait,
361                    "try" => PhpTokenType::Try,
362                    "unset" => PhpTokenType::Unset,
363                    "use" => PhpTokenType::Use,
364                    "var" => PhpTokenType::Var,
365                    "while" => PhpTokenType::While,
366                    "xor" => PhpTokenType::Xor,
367                    "yield" => PhpTokenType::Yield,
368                    "true" => PhpTokenType::BooleanLiteral,
369                    "false" => PhpTokenType::BooleanLiteral,
370                    "null" => PhpTokenType::NullLiteral,
371                    _ => PhpTokenType::Identifier,
372                };
373
374                state.add_token(kind, start_pos, state.get_position());
375                true
376            }
377            else {
378                false
379            }
380        }
381        else {
382            false
383        }
384    }
385
386    fn lex_operators_and_punctuation<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
387        if let Some(ch) = state.peek() {
388            let start_pos = state.get_position();
389
390            let kind = match ch {
391                '+' => {
392                    state.advance(1);
393                    if let Some('+') = state.peek() {
394                        state.advance(1);
395                        PhpTokenType::Increment
396                    }
397                    else if let Some('=') = state.peek() {
398                        state.advance(1);
399                        PhpTokenType::PlusAssign
400                    }
401                    else {
402                        PhpTokenType::Plus
403                    }
404                }
405                '-' => {
406                    state.advance(1);
407                    if let Some('-') = state.peek() {
408                        state.advance(1);
409                        PhpTokenType::Decrement
410                    }
411                    else if let Some('=') = state.peek() {
412                        state.advance(1);
413                        PhpTokenType::MinusAssign
414                    }
415                    else if let Some('>') = state.peek() {
416                        state.advance(1);
417                        PhpTokenType::Arrow
418                    }
419                    else {
420                        PhpTokenType::Minus
421                    }
422                }
423                '*' => {
424                    state.advance(1);
425                    if let Some('*') = state.peek() {
426                        state.advance(1);
427                        PhpTokenType::Power
428                    }
429                    else if let Some('=') = state.peek() {
430                        state.advance(1);
431                        PhpTokenType::MultiplyAssign
432                    }
433                    else {
434                        PhpTokenType::Multiply
435                    }
436                }
437                '/' => {
438                    state.advance(1);
439                    if let Some('=') = state.peek() {
440                        state.advance(1);
441                        PhpTokenType::DivideAssign
442                    }
443                    else {
444                        PhpTokenType::Divide
445                    }
446                }
447                '%' => {
448                    state.advance(1);
449                    if let Some('=') = state.peek() {
450                        state.advance(1);
451                        PhpTokenType::ModuloAssign
452                    }
453                    else {
454                        PhpTokenType::Modulo
455                    }
456                }
457                '=' => {
458                    state.advance(1);
459                    if let Some('=') = state.peek() {
460                        state.advance(1);
461                        if let Some('=') = state.peek() {
462                            state.advance(1);
463                            PhpTokenType::Identical
464                        }
465                        else {
466                            PhpTokenType::Equal
467                        }
468                    }
469                    else if let Some('>') = state.peek() {
470                        state.advance(1);
471                        PhpTokenType::DoubleArrow
472                    }
473                    else {
474                        PhpTokenType::Assign
475                    }
476                }
477                '!' => {
478                    state.advance(1);
479                    if let Some('=') = state.peek() {
480                        state.advance(1);
481                        if let Some('=') = state.peek() {
482                            state.advance(1);
483                            PhpTokenType::NotIdentical
484                        }
485                        else {
486                            PhpTokenType::NotEqual
487                        }
488                    }
489                    else {
490                        PhpTokenType::LogicalNot
491                    }
492                }
493                '<' => {
494                    state.advance(1);
495                    if let Some('=') = state.peek() {
496                        state.advance(1);
497                        PhpTokenType::LessEqual
498                    }
499                    else if let Some('<') = state.peek() {
500                        state.advance(1);
501                        if let Some('=') = state.peek() {
502                            state.advance(1);
503                            PhpTokenType::LeftShiftAssign
504                        }
505                        else {
506                            PhpTokenType::LeftShift
507                        }
508                    }
509                    else if let Some('>') = state.peek() {
510                        state.advance(1);
511                        PhpTokenType::Spaceship
512                    }
513                    else {
514                        PhpTokenType::Less
515                    }
516                }
517                '>' => {
518                    state.advance(1);
519                    if let Some('=') = state.peek() {
520                        state.advance(1);
521                        PhpTokenType::GreaterEqual
522                    }
523                    else if let Some('>') = state.peek() {
524                        state.advance(1);
525                        if let Some('=') = state.peek() {
526                            state.advance(1);
527                            PhpTokenType::RightShiftAssign
528                        }
529                        else {
530                            PhpTokenType::RightShift
531                        }
532                    }
533                    else {
534                        PhpTokenType::Greater
535                    }
536                }
537                '&' => {
538                    state.advance(1);
539                    if let Some('&') = state.peek() {
540                        state.advance(1);
541                        PhpTokenType::LogicalAnd
542                    }
543                    else if let Some('=') = state.peek() {
544                        state.advance(1);
545                        PhpTokenType::BitwiseAndAssign
546                    }
547                    else {
548                        PhpTokenType::BitwiseAnd
549                    }
550                }
551                '|' => {
552                    state.advance(1);
553                    if let Some('|') = state.peek() {
554                        state.advance(1);
555                        PhpTokenType::LogicalOr
556                    }
557                    else if let Some('=') = state.peek() {
558                        state.advance(1);
559                        PhpTokenType::BitwiseOrAssign
560                    }
561                    else {
562                        PhpTokenType::BitwiseOr
563                    }
564                }
565                '^' => {
566                    state.advance(1);
567                    if let Some('=') = state.peek() {
568                        state.advance(1);
569                        PhpTokenType::BitwiseXorAssign
570                    }
571                    else {
572                        PhpTokenType::BitwiseXor
573                    }
574                }
575                '~' => {
576                    state.advance(1);
577                    PhpTokenType::BitwiseNot
578                }
579                '?' => {
580                    state.advance(1);
581                    if let Some('?') = state.peek() {
582                        state.advance(1);
583                        PhpTokenType::NullCoalesce
584                    }
585                    else {
586                        PhpTokenType::Question
587                    }
588                }
589                ':' => {
590                    state.advance(1);
591                    if let Some(':') = state.peek() {
592                        state.advance(1);
593                        PhpTokenType::DoubleColon
594                    }
595                    else {
596                        PhpTokenType::Colon
597                    }
598                }
599                ';' => {
600                    state.advance(1);
601                    PhpTokenType::Semicolon
602                }
603                ',' => {
604                    state.advance(1);
605                    PhpTokenType::Comma
606                }
607                '.' => {
608                    state.advance(1);
609                    if let Some('=') = state.peek() {
610                        state.advance(1);
611                        PhpTokenType::ConcatAssign
612                    }
613                    else {
614                        PhpTokenType::Dot
615                    }
616                }
617                '(' => {
618                    state.advance(1);
619                    PhpTokenType::LeftParen
620                }
621                ')' => {
622                    state.advance(1);
623                    PhpTokenType::RightParen
624                }
625                '[' => {
626                    state.advance(1);
627                    PhpTokenType::LeftBracket
628                }
629                ']' => {
630                    state.advance(1);
631                    PhpTokenType::RightBracket
632                }
633                '{' => {
634                    state.advance(1);
635                    PhpTokenType::LeftBrace
636                }
637                '}' => {
638                    state.advance(1);
639                    PhpTokenType::RightBrace
640                }
641                '$' => {
642                    state.advance(1);
643                    PhpTokenType::Dollar
644                }
645                '@' => {
646                    state.advance(1);
647                    PhpTokenType::At
648                }
649                _ => return false,
650            };
651
652            state.add_token(kind, start_pos, state.get_position());
653            true
654        }
655        else {
656            false
657        }
658    }
659}