oak_php/lexer/
mod.rs

1use crate::{kind::PhpSyntaxKind, language::PhpLanguage};
2use oak_core::{Lexer, LexerCache, LexerState, OakError, lexer::LexOutput, source::Source};
3
4type State<'s, S> = LexerState<'s, S, PhpLanguage>;
5
/// Lexer that splits PHP source text into `PhpSyntaxKind` tokens.
#[derive(Clone)]
pub struct PhpLexer<'config> {
    /// Borrowed language configuration. It is stored at construction time but
    /// none of the lexing routines in this file read it.
    _config: &'config PhpLanguage,
}
10
11impl<'config> PhpLexer<'config> {
12    pub fn new(config: &'config PhpLanguage) -> Self {
13        Self { _config: config }
14    }
15
16    fn run<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<(), OakError> {
17        while state.not_at_end() {
18            if self.skip_whitespace(state) {
19                continue;
20            }
21
22            if self.lex_newline(state) {
23                continue;
24            }
25
26            if self.lex_comment(state) {
27                continue;
28            }
29
30            if self.lex_string(state) {
31                continue;
32            }
33
34            if self.lex_number(state) {
35                continue;
36            }
37
38            if self.lex_identifier_or_keyword(state) {
39                continue;
40            }
41
42            if self.lex_operators_and_punctuation(state) {
43                continue;
44            }
45
46            // 如果没有匹配任何规则,跳过当前字符
47            if let Some(ch) = state.peek() {
48                let start_pos = state.get_position();
49                state.advance(ch.len_utf8());
50                state.add_token(PhpSyntaxKind::Error, start_pos, state.get_position());
51            }
52            else {
53                // 如果已到达文件末尾,退出循环
54                break;
55            }
56        }
57
58        // Add EOF token
59        state.add_eof();
60
61        Ok(())
62    }
63
64    fn skip_whitespace<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
65        let start_pos = state.get_position();
66
67        while let Some(ch) = state.peek() {
68            if ch == ' ' || ch == '\t' {
69                state.advance(ch.len_utf8());
70            }
71            else {
72                break;
73            }
74        }
75
76        if state.get_position() > start_pos {
77            state.add_token(PhpSyntaxKind::Whitespace, start_pos, state.get_position());
78            true
79        }
80        else {
81            false
82        }
83    }
84
85    fn lex_newline<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
86        let start_pos = state.get_position();
87
88        if let Some('\n') = state.peek() {
89            state.advance(1);
90            state.add_token(PhpSyntaxKind::Newline, start_pos, state.get_position());
91            true
92        }
93        else if let Some('\r') = state.peek() {
94            state.advance(1);
95            if let Some('\n') = state.peek() {
96                state.advance(1);
97            }
98            state.add_token(PhpSyntaxKind::Newline, start_pos, state.get_position());
99            true
100        }
101        else {
102            false
103        }
104    }
105
106    fn lex_comment<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
107        let start_pos = state.get_position();
108
109        if let Some('/') = state.peek() {
110            state.advance(1);
111            if let Some('/') = state.peek() {
112                state.advance(1);
113                // 单行注释
114                while let Some(ch) = state.peek() {
115                    if ch == '\n' || ch == '\r' {
116                        break;
117                    }
118                    state.advance(ch.len_utf8());
119                }
120                state.add_token(PhpSyntaxKind::Comment, start_pos, state.get_position());
121                return true;
122            }
123            else if let Some('*') = state.peek() {
124                state.advance(1);
125                // 多行注释
126                while let Some(ch) = state.peek() {
127                    if ch == '*' {
128                        state.advance(1);
129                        if let Some('/') = state.peek() {
130                            state.advance(1);
131                            break;
132                        }
133                    }
134                    else {
135                        state.advance(ch.len_utf8());
136                    }
137                }
138                state.add_token(PhpSyntaxKind::Comment, start_pos, state.get_position());
139                return true;
140            }
141            else {
142                // 回退,这不是注释
143                state.set_position(start_pos);
144                return false;
145            }
146        }
147        else if let Some('#') = state.peek() {
148            state.advance(1);
149            // PHP 风格的单行注释
150            while let Some(ch) = state.peek() {
151                if ch == '\n' || ch == '\r' {
152                    break;
153                }
154                state.advance(ch.len_utf8());
155            }
156            state.add_token(PhpSyntaxKind::Comment, start_pos, state.get_position());
157            true
158        }
159        else {
160            false
161        }
162    }
163
164    fn lex_string<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
165        let start_pos = state.get_position();
166
167        if let Some(quote_char) = state.peek() {
168            if quote_char == '"' || quote_char == '\'' {
169                state.advance(1); // 跳过开始引号
170
171                let mut escaped = false;
172                while let Some(ch) = state.peek() {
173                    if escaped {
174                        escaped = false;
175                        state.advance(ch.len_utf8());
176                    }
177                    else if ch == '\\' {
178                        escaped = true;
179                        state.advance(1);
180                    }
181                    else if ch == quote_char {
182                        state.advance(1); // 跳过结束引号
183                        break;
184                    }
185                    else if ch == '\n' || ch == '\r' {
186                        // 字符串不能跨行(除非转义)
187                        break;
188                    }
189                    else {
190                        state.advance(ch.len_utf8());
191                    }
192                }
193
194                state.add_token(PhpSyntaxKind::StringLiteral, start_pos, state.get_position());
195                true
196            }
197            else {
198                false
199            }
200        }
201        else {
202            false
203        }
204    }
205
206    fn lex_number<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
207        if let Some(ch) = state.peek() {
208            if ch.is_ascii_digit() {
209                let start_pos = state.get_position();
210
211                // 读取整数部分
212                while let Some(ch) = state.peek() {
213                    if ch.is_ascii_digit() {
214                        state.advance(1);
215                    }
216                    else {
217                        break;
218                    }
219                }
220
221                // 检查小数点
222                if let Some('.') = state.peek() {
223                    state.advance(1);
224                    // 读取小数部分
225                    while let Some(ch) = state.peek() {
226                        if ch.is_ascii_digit() {
227                            state.advance(1);
228                        }
229                        else {
230                            break;
231                        }
232                    }
233                }
234
235                // 检查科学记数法
236                if let Some(ch) = state.peek() {
237                    if ch == 'e' || ch == 'E' {
238                        state.advance(1);
239                        if let Some(ch) = state.peek() {
240                            if ch == '+' || ch == '-' {
241                                state.advance(1);
242                            }
243                        }
244                        while let Some(ch) = state.peek() {
245                            if ch.is_ascii_digit() {
246                                state.advance(1);
247                            }
248                            else {
249                                break;
250                            }
251                        }
252                    }
253                }
254
255                state.add_token(PhpSyntaxKind::NumberLiteral, start_pos, state.get_position());
256                true
257            }
258            else {
259                false
260            }
261        }
262        else {
263            false
264        }
265    }
266
267    fn lex_identifier_or_keyword<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
268        if let Some(ch) = state.peek() {
269            if ch.is_alphabetic() || ch == '_' || ch == '$' {
270                let start_pos = state.get_position();
271
272                // 读取标识符
273                while let Some(ch) = state.peek() {
274                    if ch.is_alphanumeric() || ch == '_' || ch == '$' {
275                        state.advance(ch.len_utf8());
276                    }
277                    else {
278                        break;
279                    }
280                }
281
282                let end_pos = state.get_position();
283                let text = state.source().get_text_in(oak_core::Range { start: start_pos, end: end_pos });
284
285                // 检查是否是关键字
286                let kind = match text.as_ref() {
287                    "abstract" => PhpSyntaxKind::Abstract,
288                    "and" => PhpSyntaxKind::And,
289                    "array" => PhpSyntaxKind::Array,
290                    "as" => PhpSyntaxKind::As,
291                    "break" => PhpSyntaxKind::Break,
292                    "callable" => PhpSyntaxKind::Callable,
293                    "case" => PhpSyntaxKind::Case,
294                    "catch" => PhpSyntaxKind::Catch,
295                    "class" => PhpSyntaxKind::Class,
296                    "clone" => PhpSyntaxKind::Clone,
297                    "const" => PhpSyntaxKind::Const,
298                    "continue" => PhpSyntaxKind::Continue,
299                    "declare" => PhpSyntaxKind::Declare,
300                    "default" => PhpSyntaxKind::Default,
301                    "die" => PhpSyntaxKind::Exit,
302                    "do" => PhpSyntaxKind::Do,
303                    "echo" => PhpSyntaxKind::Echo,
304                    "else" => PhpSyntaxKind::Else,
305                    "elseif" => PhpSyntaxKind::Elseif,
306                    "empty" => PhpSyntaxKind::Empty,
307                    "enddeclare" => PhpSyntaxKind::Enddeclare,
308                    "endfor" => PhpSyntaxKind::Endfor,
309                    "endforeach" => PhpSyntaxKind::Endforeach,
310                    "endif" => PhpSyntaxKind::Endif,
311                    "endswitch" => PhpSyntaxKind::Endswitch,
312                    "endwhile" => PhpSyntaxKind::Endwhile,
313                    "eval" => PhpSyntaxKind::Eval,
314                    "exit" => PhpSyntaxKind::Exit,
315                    "extends" => PhpSyntaxKind::Extends,
316                    "final" => PhpSyntaxKind::Final,
317                    "finally" => PhpSyntaxKind::Finally,
318                    "for" => PhpSyntaxKind::For,
319                    "foreach" => PhpSyntaxKind::Foreach,
320                    "function" => PhpSyntaxKind::Function,
321                    "global" => PhpSyntaxKind::Global,
322                    "goto" => PhpSyntaxKind::Goto,
323                    "if" => PhpSyntaxKind::If,
324                    "implements" => PhpSyntaxKind::Implements,
325                    "include" => PhpSyntaxKind::Include,
326                    "include_once" => PhpSyntaxKind::IncludeOnce,
327                    "instanceof" => PhpSyntaxKind::Instanceof,
328                    "insteadof" => PhpSyntaxKind::Insteadof,
329                    "interface" => PhpSyntaxKind::Interface,
330                    "isset" => PhpSyntaxKind::Isset,
331                    "list" => PhpSyntaxKind::List,
332                    "namespace" => PhpSyntaxKind::Namespace,
333                    "new" => PhpSyntaxKind::New,
334                    "or" => PhpSyntaxKind::Or,
335                    "print" => PhpSyntaxKind::Print,
336                    "private" => PhpSyntaxKind::Private,
337                    "protected" => PhpSyntaxKind::Protected,
338                    "public" => PhpSyntaxKind::Public,
339                    "require" => PhpSyntaxKind::Require,
340                    "require_once" => PhpSyntaxKind::RequireOnce,
341                    "return" => PhpSyntaxKind::Return,
342                    "static" => PhpSyntaxKind::Static,
343                    "switch" => PhpSyntaxKind::Switch,
344                    "throw" => PhpSyntaxKind::Throw,
345                    "trait" => PhpSyntaxKind::Trait,
346                    "try" => PhpSyntaxKind::Try,
347                    "unset" => PhpSyntaxKind::Unset,
348                    "use" => PhpSyntaxKind::Use,
349                    "var" => PhpSyntaxKind::Var,
350                    "while" => PhpSyntaxKind::While,
351                    "xor" => PhpSyntaxKind::Xor,
352                    "yield" => PhpSyntaxKind::Yield,
353                    "true" => PhpSyntaxKind::BooleanLiteral,
354                    "false" => PhpSyntaxKind::BooleanLiteral,
355                    "null" => PhpSyntaxKind::NullLiteral,
356                    _ => PhpSyntaxKind::Identifier,
357                };
358
359                state.add_token(kind, start_pos, state.get_position());
360                true
361            }
362            else {
363                false
364            }
365        }
366        else {
367            false
368        }
369    }
370
371    fn lex_operators_and_punctuation<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
372        if let Some(ch) = state.peek() {
373            let start_pos = state.get_position();
374
375            let kind = match ch {
376                '+' => {
377                    state.advance(1);
378                    if let Some('+') = state.peek() {
379                        state.advance(1);
380                        PhpSyntaxKind::Increment
381                    }
382                    else if let Some('=') = state.peek() {
383                        state.advance(1);
384                        PhpSyntaxKind::PlusAssign
385                    }
386                    else {
387                        PhpSyntaxKind::Plus
388                    }
389                }
390                '-' => {
391                    state.advance(1);
392                    if let Some('-') = state.peek() {
393                        state.advance(1);
394                        PhpSyntaxKind::Decrement
395                    }
396                    else if let Some('=') = state.peek() {
397                        state.advance(1);
398                        PhpSyntaxKind::MinusAssign
399                    }
400                    else if let Some('>') = state.peek() {
401                        state.advance(1);
402                        PhpSyntaxKind::Arrow
403                    }
404                    else {
405                        PhpSyntaxKind::Minus
406                    }
407                }
408                '*' => {
409                    state.advance(1);
410                    if let Some('*') = state.peek() {
411                        state.advance(1);
412                        PhpSyntaxKind::Power
413                    }
414                    else if let Some('=') = state.peek() {
415                        state.advance(1);
416                        PhpSyntaxKind::MultiplyAssign
417                    }
418                    else {
419                        PhpSyntaxKind::Multiply
420                    }
421                }
422                '/' => {
423                    state.advance(1);
424                    if let Some('=') = state.peek() {
425                        state.advance(1);
426                        PhpSyntaxKind::DivideAssign
427                    }
428                    else {
429                        PhpSyntaxKind::Divide
430                    }
431                }
432                '%' => {
433                    state.advance(1);
434                    if let Some('=') = state.peek() {
435                        state.advance(1);
436                        PhpSyntaxKind::ModuloAssign
437                    }
438                    else {
439                        PhpSyntaxKind::Modulo
440                    }
441                }
442                '=' => {
443                    state.advance(1);
444                    if let Some('=') = state.peek() {
445                        state.advance(1);
446                        if let Some('=') = state.peek() {
447                            state.advance(1);
448                            PhpSyntaxKind::Identical
449                        }
450                        else {
451                            PhpSyntaxKind::Equal
452                        }
453                    }
454                    else if let Some('>') = state.peek() {
455                        state.advance(1);
456                        PhpSyntaxKind::DoubleArrow
457                    }
458                    else {
459                        PhpSyntaxKind::Assign
460                    }
461                }
462                '!' => {
463                    state.advance(1);
464                    if let Some('=') = state.peek() {
465                        state.advance(1);
466                        if let Some('=') = state.peek() {
467                            state.advance(1);
468                            PhpSyntaxKind::NotIdentical
469                        }
470                        else {
471                            PhpSyntaxKind::NotEqual
472                        }
473                    }
474                    else {
475                        PhpSyntaxKind::LogicalNot
476                    }
477                }
478                '<' => {
479                    state.advance(1);
480                    if let Some('=') = state.peek() {
481                        state.advance(1);
482                        PhpSyntaxKind::LessEqual
483                    }
484                    else if let Some('<') = state.peek() {
485                        state.advance(1);
486                        if let Some('=') = state.peek() {
487                            state.advance(1);
488                            PhpSyntaxKind::LeftShiftAssign
489                        }
490                        else {
491                            PhpSyntaxKind::LeftShift
492                        }
493                    }
494                    else if let Some('>') = state.peek() {
495                        state.advance(1);
496                        PhpSyntaxKind::Spaceship
497                    }
498                    else {
499                        PhpSyntaxKind::Less
500                    }
501                }
502                '>' => {
503                    state.advance(1);
504                    if let Some('=') = state.peek() {
505                        state.advance(1);
506                        PhpSyntaxKind::GreaterEqual
507                    }
508                    else if let Some('>') = state.peek() {
509                        state.advance(1);
510                        if let Some('=') = state.peek() {
511                            state.advance(1);
512                            PhpSyntaxKind::RightShiftAssign
513                        }
514                        else {
515                            PhpSyntaxKind::RightShift
516                        }
517                    }
518                    else {
519                        PhpSyntaxKind::Greater
520                    }
521                }
522                '&' => {
523                    state.advance(1);
524                    if let Some('&') = state.peek() {
525                        state.advance(1);
526                        PhpSyntaxKind::LogicalAnd
527                    }
528                    else if let Some('=') = state.peek() {
529                        state.advance(1);
530                        PhpSyntaxKind::BitwiseAndAssign
531                    }
532                    else {
533                        PhpSyntaxKind::BitwiseAnd
534                    }
535                }
536                '|' => {
537                    state.advance(1);
538                    if let Some('|') = state.peek() {
539                        state.advance(1);
540                        PhpSyntaxKind::LogicalOr
541                    }
542                    else if let Some('=') = state.peek() {
543                        state.advance(1);
544                        PhpSyntaxKind::BitwiseOrAssign
545                    }
546                    else {
547                        PhpSyntaxKind::BitwiseOr
548                    }
549                }
550                '^' => {
551                    state.advance(1);
552                    if let Some('=') = state.peek() {
553                        state.advance(1);
554                        PhpSyntaxKind::BitwiseXorAssign
555                    }
556                    else {
557                        PhpSyntaxKind::BitwiseXor
558                    }
559                }
560                '~' => {
561                    state.advance(1);
562                    PhpSyntaxKind::BitwiseNot
563                }
564                '?' => {
565                    state.advance(1);
566                    if let Some('?') = state.peek() {
567                        state.advance(1);
568                        PhpSyntaxKind::NullCoalesce
569                    }
570                    else {
571                        PhpSyntaxKind::Question
572                    }
573                }
574                ':' => {
575                    state.advance(1);
576                    if let Some(':') = state.peek() {
577                        state.advance(1);
578                        PhpSyntaxKind::DoubleColon
579                    }
580                    else {
581                        PhpSyntaxKind::Colon
582                    }
583                }
584                ';' => {
585                    state.advance(1);
586                    PhpSyntaxKind::Semicolon
587                }
588                ',' => {
589                    state.advance(1);
590                    PhpSyntaxKind::Comma
591                }
592                '.' => {
593                    state.advance(1);
594                    if let Some('=') = state.peek() {
595                        state.advance(1);
596                        PhpSyntaxKind::ConcatAssign
597                    }
598                    else {
599                        PhpSyntaxKind::Dot
600                    }
601                }
602                '(' => {
603                    state.advance(1);
604                    PhpSyntaxKind::LeftParen
605                }
606                ')' => {
607                    state.advance(1);
608                    PhpSyntaxKind::RightParen
609                }
610                '[' => {
611                    state.advance(1);
612                    PhpSyntaxKind::LeftBracket
613                }
614                ']' => {
615                    state.advance(1);
616                    PhpSyntaxKind::RightBracket
617                }
618                '{' => {
619                    state.advance(1);
620                    PhpSyntaxKind::LeftBrace
621                }
622                '}' => {
623                    state.advance(1);
624                    PhpSyntaxKind::RightBrace
625                }
626                '$' => {
627                    state.advance(1);
628                    PhpSyntaxKind::Dollar
629                }
630                '@' => {
631                    state.advance(1);
632                    PhpSyntaxKind::At
633                }
634                _ => return false,
635            };
636
637            state.add_token(kind, start_pos, state.get_position());
638            true
639        }
640        else {
641            false
642        }
643    }
644}
645
646impl<'config> Lexer<PhpLanguage> for PhpLexer<'config> {
647    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[oak_core::TextEdit], cache: &'a mut impl LexerCache<PhpLanguage>) -> LexOutput<PhpLanguage> {
648        let mut state: State<'_, S> = LexerState::new(source);
649        let result = self.run(&mut state);
650        state.finish_with_cache(result, cache)
651    }
652}