Skip to main content

oak_php/lexer/
mod.rs

1use crate::{kind::PhpSyntaxKind, language::PhpLanguage};
2use oak_core::{Lexer, LexerCache, LexerState, OakError, lexer::LexOutput, source::Source};
3
/// Shorthand for the `oak_core` [`LexerState`] specialised to [`PhpLanguage`].
type State<'s, S> = LexerState<'s, S, PhpLanguage>;
5
/// Lexer that turns PHP source text into a stream of [`PhpSyntaxKind`] tokens.
#[derive(Clone, Debug)]
pub struct PhpLexer<'config> {
    /// Borrowed language configuration. It is stored by `new` but not read by
    /// any lexing routine in this file.
    _config: &'config PhpLanguage,
}
10
11impl<'config> Lexer<PhpLanguage> for PhpLexer<'config> {
12    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<PhpLanguage>) -> LexOutput<PhpLanguage> {
13        let mut state = State::new_with_cache(source, 0, cache);
14        let result = self.run(&mut state);
15        if result.is_ok() {
16            state.add_eof();
17        }
18        state.finish_with_cache(result, cache)
19    }
20}
21
22impl<'config> PhpLexer<'config> {
23    pub fn new(config: &'config PhpLanguage) -> Self {
24        Self { _config: config }
25    }
26
27    fn run<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<(), OakError> {
28        while state.not_at_end() {
29            if self.skip_whitespace(state) {
30                continue;
31            }
32
33            if self.lex_newline(state) {
34                continue;
35            }
36
37            if self.lex_comment(state) {
38                continue;
39            }
40
41            if self.lex_string(state) {
42                continue;
43            }
44
45            if self.lex_number(state) {
46                continue;
47            }
48
49            if self.lex_identifier_or_keyword(state) {
50                continue;
51            }
52
53            if self.lex_operators_and_punctuation(state) {
54                continue;
55            }
56
57            // 如果没有匹配任何规则,跳过当前字符
58            if let Some(ch) = state.peek() {
59                let start_pos = state.get_position();
60                state.advance(ch.len_utf8());
61                state.add_token(PhpSyntaxKind::Error, start_pos, state.get_position());
62            }
63            else {
64                // 如果已到达文件末尾,退出循环
65                break;
66            }
67        }
68
69        Ok(())
70    }
71
72    fn skip_whitespace<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
73        let start_pos = state.get_position();
74
75        while let Some(ch) = state.peek() {
76            if ch == ' ' || ch == '\t' {
77                state.advance(ch.len_utf8());
78            }
79            else {
80                break;
81            }
82        }
83
84        if state.get_position() > start_pos {
85            state.add_token(PhpSyntaxKind::Whitespace, start_pos, state.get_position());
86            true
87        }
88        else {
89            false
90        }
91    }
92
93    fn lex_newline<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
94        let start_pos = state.get_position();
95
96        if let Some('\n') = state.peek() {
97            state.advance(1);
98            state.add_token(PhpSyntaxKind::Newline, start_pos, state.get_position());
99            true
100        }
101        else if let Some('\r') = state.peek() {
102            state.advance(1);
103            if let Some('\n') = state.peek() {
104                state.advance(1);
105            }
106            state.add_token(PhpSyntaxKind::Newline, start_pos, state.get_position());
107            true
108        }
109        else {
110            false
111        }
112    }
113
114    fn lex_comment<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
115        let start_pos = state.get_position();
116
117        if let Some('/') = state.peek() {
118            state.advance(1);
119            if let Some('/') = state.peek() {
120                state.advance(1);
121                // 单行注释
122                while let Some(ch) = state.peek() {
123                    if ch == '\n' || ch == '\r' {
124                        break;
125                    }
126                    state.advance(ch.len_utf8());
127                }
128                state.add_token(PhpSyntaxKind::Comment, start_pos, state.get_position());
129                return true;
130            }
131            else if let Some('*') = state.peek() {
132                state.advance(1);
133                // 多行注释
134                while let Some(ch) = state.peek() {
135                    if ch == '*' {
136                        state.advance(1);
137                        if let Some('/') = state.peek() {
138                            state.advance(1);
139                            break;
140                        }
141                    }
142                    else {
143                        state.advance(ch.len_utf8());
144                    }
145                }
146                state.add_token(PhpSyntaxKind::Comment, start_pos, state.get_position());
147                return true;
148            }
149            else {
150                // 回退,这不是注释
151                state.set_position(start_pos);
152                return false;
153            }
154        }
155        else if let Some('#') = state.peek() {
156            state.advance(1);
157            // PHP 风格的单行注释
158            while let Some(ch) = state.peek() {
159                if ch == '\n' || ch == '\r' {
160                    break;
161                }
162                state.advance(ch.len_utf8());
163            }
164            state.add_token(PhpSyntaxKind::Comment, start_pos, state.get_position());
165            true
166        }
167        else {
168            false
169        }
170    }
171
172    fn lex_string<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
173        let start_pos = state.get_position();
174
175        if let Some(quote_char) = state.peek() {
176            if quote_char == '"' || quote_char == '\'' {
177                state.advance(1); // 跳过开始引号
178
179                let mut escaped = false;
180                while let Some(ch) = state.peek() {
181                    if escaped {
182                        escaped = false;
183                        state.advance(ch.len_utf8());
184                    }
185                    else if ch == '\\' {
186                        escaped = true;
187                        state.advance(1);
188                    }
189                    else if ch == quote_char {
190                        state.advance(1); // 跳过结束引号
191                        break;
192                    }
193                    else if ch == '\n' || ch == '\r' {
194                        // 字符串不能跨行(除非转义)
195                        break;
196                    }
197                    else {
198                        state.advance(ch.len_utf8());
199                    }
200                }
201
202                state.add_token(PhpSyntaxKind::StringLiteral, start_pos, state.get_position());
203                true
204            }
205            else {
206                false
207            }
208        }
209        else {
210            false
211        }
212    }
213
214    fn lex_number<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
215        if let Some(ch) = state.peek() {
216            if ch.is_ascii_digit() {
217                let start_pos = state.get_position();
218
219                // 读取整数部分
220                while let Some(ch) = state.peek() {
221                    if ch.is_ascii_digit() {
222                        state.advance(1);
223                    }
224                    else {
225                        break;
226                    }
227                }
228
229                // 检查小数点
230                if let Some('.') = state.peek() {
231                    state.advance(1);
232                    // 读取小数部分
233                    while let Some(ch) = state.peek() {
234                        if ch.is_ascii_digit() {
235                            state.advance(1);
236                        }
237                        else {
238                            break;
239                        }
240                    }
241                }
242
243                // 检查科学记数法
244                if let Some(ch) = state.peek() {
245                    if ch == 'e' || ch == 'E' {
246                        state.advance(1);
247                        if let Some(ch) = state.peek() {
248                            if ch == '+' || ch == '-' {
249                                state.advance(1);
250                            }
251                        }
252                        while let Some(ch) = state.peek() {
253                            if ch.is_ascii_digit() {
254                                state.advance(1);
255                            }
256                            else {
257                                break;
258                            }
259                        }
260                    }
261                }
262
263                state.add_token(PhpSyntaxKind::NumberLiteral, start_pos, state.get_position());
264                true
265            }
266            else {
267                false
268            }
269        }
270        else {
271            false
272        }
273    }
274
275    fn lex_identifier_or_keyword<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
276        if let Some(ch) = state.peek() {
277            if ch.is_alphabetic() || ch == '_' || ch == '$' {
278                let start_pos = state.get_position();
279
280                // 读取标识符
281                while let Some(ch) = state.peek() {
282                    if ch.is_alphanumeric() || ch == '_' || ch == '$' {
283                        state.advance(ch.len_utf8());
284                    }
285                    else {
286                        break;
287                    }
288                }
289
290                let end_pos = state.get_position();
291                let text = state.source().get_text_in(oak_core::Range { start: start_pos, end: end_pos });
292
293                // 检查是否是关键字
294                let kind = match text.as_ref() {
295                    "abstract" => PhpSyntaxKind::Abstract,
296                    "and" => PhpSyntaxKind::And,
297                    "array" => PhpSyntaxKind::Array,
298                    "as" => PhpSyntaxKind::As,
299                    "break" => PhpSyntaxKind::Break,
300                    "callable" => PhpSyntaxKind::Callable,
301                    "case" => PhpSyntaxKind::Case,
302                    "catch" => PhpSyntaxKind::Catch,
303                    "class" => PhpSyntaxKind::Class,
304                    "clone" => PhpSyntaxKind::Clone,
305                    "const" => PhpSyntaxKind::Const,
306                    "continue" => PhpSyntaxKind::Continue,
307                    "declare" => PhpSyntaxKind::Declare,
308                    "default" => PhpSyntaxKind::Default,
309                    "die" => PhpSyntaxKind::Exit,
310                    "do" => PhpSyntaxKind::Do,
311                    "echo" => PhpSyntaxKind::Echo,
312                    "else" => PhpSyntaxKind::Else,
313                    "elseif" => PhpSyntaxKind::Elseif,
314                    "empty" => PhpSyntaxKind::Empty,
315                    "enddeclare" => PhpSyntaxKind::Enddeclare,
316                    "endfor" => PhpSyntaxKind::Endfor,
317                    "endforeach" => PhpSyntaxKind::Endforeach,
318                    "endif" => PhpSyntaxKind::Endif,
319                    "endswitch" => PhpSyntaxKind::Endswitch,
320                    "endwhile" => PhpSyntaxKind::Endwhile,
321                    "eval" => PhpSyntaxKind::Eval,
322                    "exit" => PhpSyntaxKind::Exit,
323                    "extends" => PhpSyntaxKind::Extends,
324                    "final" => PhpSyntaxKind::Final,
325                    "finally" => PhpSyntaxKind::Finally,
326                    "for" => PhpSyntaxKind::For,
327                    "foreach" => PhpSyntaxKind::Foreach,
328                    "function" => PhpSyntaxKind::Function,
329                    "global" => PhpSyntaxKind::Global,
330                    "goto" => PhpSyntaxKind::Goto,
331                    "if" => PhpSyntaxKind::If,
332                    "implements" => PhpSyntaxKind::Implements,
333                    "include" => PhpSyntaxKind::Include,
334                    "include_once" => PhpSyntaxKind::IncludeOnce,
335                    "instanceof" => PhpSyntaxKind::Instanceof,
336                    "insteadof" => PhpSyntaxKind::Insteadof,
337                    "interface" => PhpSyntaxKind::Interface,
338                    "isset" => PhpSyntaxKind::Isset,
339                    "list" => PhpSyntaxKind::List,
340                    "namespace" => PhpSyntaxKind::Namespace,
341                    "new" => PhpSyntaxKind::New,
342                    "or" => PhpSyntaxKind::Or,
343                    "print" => PhpSyntaxKind::Print,
344                    "private" => PhpSyntaxKind::Private,
345                    "protected" => PhpSyntaxKind::Protected,
346                    "public" => PhpSyntaxKind::Public,
347                    "require" => PhpSyntaxKind::Require,
348                    "require_once" => PhpSyntaxKind::RequireOnce,
349                    "return" => PhpSyntaxKind::Return,
350                    "static" => PhpSyntaxKind::Static,
351                    "switch" => PhpSyntaxKind::Switch,
352                    "throw" => PhpSyntaxKind::Throw,
353                    "trait" => PhpSyntaxKind::Trait,
354                    "try" => PhpSyntaxKind::Try,
355                    "unset" => PhpSyntaxKind::Unset,
356                    "use" => PhpSyntaxKind::Use,
357                    "var" => PhpSyntaxKind::Var,
358                    "while" => PhpSyntaxKind::While,
359                    "xor" => PhpSyntaxKind::Xor,
360                    "yield" => PhpSyntaxKind::Yield,
361                    "true" => PhpSyntaxKind::BooleanLiteral,
362                    "false" => PhpSyntaxKind::BooleanLiteral,
363                    "null" => PhpSyntaxKind::NullLiteral,
364                    _ => PhpSyntaxKind::Identifier,
365                };
366
367                state.add_token(kind, start_pos, state.get_position());
368                true
369            }
370            else {
371                false
372            }
373        }
374        else {
375            false
376        }
377    }
378
379    fn lex_operators_and_punctuation<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
380        if let Some(ch) = state.peek() {
381            let start_pos = state.get_position();
382
383            let kind = match ch {
384                '+' => {
385                    state.advance(1);
386                    if let Some('+') = state.peek() {
387                        state.advance(1);
388                        PhpSyntaxKind::Increment
389                    }
390                    else if let Some('=') = state.peek() {
391                        state.advance(1);
392                        PhpSyntaxKind::PlusAssign
393                    }
394                    else {
395                        PhpSyntaxKind::Plus
396                    }
397                }
398                '-' => {
399                    state.advance(1);
400                    if let Some('-') = state.peek() {
401                        state.advance(1);
402                        PhpSyntaxKind::Decrement
403                    }
404                    else if let Some('=') = state.peek() {
405                        state.advance(1);
406                        PhpSyntaxKind::MinusAssign
407                    }
408                    else if let Some('>') = state.peek() {
409                        state.advance(1);
410                        PhpSyntaxKind::Arrow
411                    }
412                    else {
413                        PhpSyntaxKind::Minus
414                    }
415                }
416                '*' => {
417                    state.advance(1);
418                    if let Some('*') = state.peek() {
419                        state.advance(1);
420                        PhpSyntaxKind::Power
421                    }
422                    else if let Some('=') = state.peek() {
423                        state.advance(1);
424                        PhpSyntaxKind::MultiplyAssign
425                    }
426                    else {
427                        PhpSyntaxKind::Multiply
428                    }
429                }
430                '/' => {
431                    state.advance(1);
432                    if let Some('=') = state.peek() {
433                        state.advance(1);
434                        PhpSyntaxKind::DivideAssign
435                    }
436                    else {
437                        PhpSyntaxKind::Divide
438                    }
439                }
440                '%' => {
441                    state.advance(1);
442                    if let Some('=') = state.peek() {
443                        state.advance(1);
444                        PhpSyntaxKind::ModuloAssign
445                    }
446                    else {
447                        PhpSyntaxKind::Modulo
448                    }
449                }
450                '=' => {
451                    state.advance(1);
452                    if let Some('=') = state.peek() {
453                        state.advance(1);
454                        if let Some('=') = state.peek() {
455                            state.advance(1);
456                            PhpSyntaxKind::Identical
457                        }
458                        else {
459                            PhpSyntaxKind::Equal
460                        }
461                    }
462                    else if let Some('>') = state.peek() {
463                        state.advance(1);
464                        PhpSyntaxKind::DoubleArrow
465                    }
466                    else {
467                        PhpSyntaxKind::Assign
468                    }
469                }
470                '!' => {
471                    state.advance(1);
472                    if let Some('=') = state.peek() {
473                        state.advance(1);
474                        if let Some('=') = state.peek() {
475                            state.advance(1);
476                            PhpSyntaxKind::NotIdentical
477                        }
478                        else {
479                            PhpSyntaxKind::NotEqual
480                        }
481                    }
482                    else {
483                        PhpSyntaxKind::LogicalNot
484                    }
485                }
486                '<' => {
487                    state.advance(1);
488                    if let Some('=') = state.peek() {
489                        state.advance(1);
490                        PhpSyntaxKind::LessEqual
491                    }
492                    else if let Some('<') = state.peek() {
493                        state.advance(1);
494                        if let Some('=') = state.peek() {
495                            state.advance(1);
496                            PhpSyntaxKind::LeftShiftAssign
497                        }
498                        else {
499                            PhpSyntaxKind::LeftShift
500                        }
501                    }
502                    else if let Some('>') = state.peek() {
503                        state.advance(1);
504                        PhpSyntaxKind::Spaceship
505                    }
506                    else {
507                        PhpSyntaxKind::Less
508                    }
509                }
510                '>' => {
511                    state.advance(1);
512                    if let Some('=') = state.peek() {
513                        state.advance(1);
514                        PhpSyntaxKind::GreaterEqual
515                    }
516                    else if let Some('>') = state.peek() {
517                        state.advance(1);
518                        if let Some('=') = state.peek() {
519                            state.advance(1);
520                            PhpSyntaxKind::RightShiftAssign
521                        }
522                        else {
523                            PhpSyntaxKind::RightShift
524                        }
525                    }
526                    else {
527                        PhpSyntaxKind::Greater
528                    }
529                }
530                '&' => {
531                    state.advance(1);
532                    if let Some('&') = state.peek() {
533                        state.advance(1);
534                        PhpSyntaxKind::LogicalAnd
535                    }
536                    else if let Some('=') = state.peek() {
537                        state.advance(1);
538                        PhpSyntaxKind::BitwiseAndAssign
539                    }
540                    else {
541                        PhpSyntaxKind::BitwiseAnd
542                    }
543                }
544                '|' => {
545                    state.advance(1);
546                    if let Some('|') = state.peek() {
547                        state.advance(1);
548                        PhpSyntaxKind::LogicalOr
549                    }
550                    else if let Some('=') = state.peek() {
551                        state.advance(1);
552                        PhpSyntaxKind::BitwiseOrAssign
553                    }
554                    else {
555                        PhpSyntaxKind::BitwiseOr
556                    }
557                }
558                '^' => {
559                    state.advance(1);
560                    if let Some('=') = state.peek() {
561                        state.advance(1);
562                        PhpSyntaxKind::BitwiseXorAssign
563                    }
564                    else {
565                        PhpSyntaxKind::BitwiseXor
566                    }
567                }
568                '~' => {
569                    state.advance(1);
570                    PhpSyntaxKind::BitwiseNot
571                }
572                '?' => {
573                    state.advance(1);
574                    if let Some('?') = state.peek() {
575                        state.advance(1);
576                        PhpSyntaxKind::NullCoalesce
577                    }
578                    else {
579                        PhpSyntaxKind::Question
580                    }
581                }
582                ':' => {
583                    state.advance(1);
584                    if let Some(':') = state.peek() {
585                        state.advance(1);
586                        PhpSyntaxKind::DoubleColon
587                    }
588                    else {
589                        PhpSyntaxKind::Colon
590                    }
591                }
592                ';' => {
593                    state.advance(1);
594                    PhpSyntaxKind::Semicolon
595                }
596                ',' => {
597                    state.advance(1);
598                    PhpSyntaxKind::Comma
599                }
600                '.' => {
601                    state.advance(1);
602                    if let Some('=') = state.peek() {
603                        state.advance(1);
604                        PhpSyntaxKind::ConcatAssign
605                    }
606                    else {
607                        PhpSyntaxKind::Dot
608                    }
609                }
610                '(' => {
611                    state.advance(1);
612                    PhpSyntaxKind::LeftParen
613                }
614                ')' => {
615                    state.advance(1);
616                    PhpSyntaxKind::RightParen
617                }
618                '[' => {
619                    state.advance(1);
620                    PhpSyntaxKind::LeftBracket
621                }
622                ']' => {
623                    state.advance(1);
624                    PhpSyntaxKind::RightBracket
625                }
626                '{' => {
627                    state.advance(1);
628                    PhpSyntaxKind::LeftBrace
629                }
630                '}' => {
631                    state.advance(1);
632                    PhpSyntaxKind::RightBrace
633                }
634                '$' => {
635                    state.advance(1);
636                    PhpSyntaxKind::Dollar
637                }
638                '@' => {
639                    state.advance(1);
640                    PhpSyntaxKind::At
641                }
642                _ => return false,
643            };
644
645            state.add_token(kind, start_pos, state.get_position());
646            true
647        }
648        else {
649            false
650        }
651    }
652}