1#![doc = include_str!("readme.md")]
2pub mod token_type;
4use crate::language::PhpLanguage;
5use oak_core::{Lexer, LexerCache, LexerState, OakError, lexer::LexOutput, source::Source};
6pub use token_type::{PhpToken, PhpTokenType};
7
8type State<'s, S> = LexerState<'s, S, PhpLanguage>;
9
10#[derive(Clone, Debug)]
14pub struct PhpLexer<'config> {
15 config: &'config PhpLanguage,
16}
17
18impl<'config> Lexer<PhpLanguage> for PhpLexer<'config> {
19 fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<PhpLanguage>) -> LexOutput<PhpLanguage> {
20 let mut state = State::new_with_cache(source, 0, cache);
21 let result = self.run(&mut state);
22 if result.is_ok() {
23 state.add_eof();
24 }
25 state.finish_with_cache(result, cache)
26 }
27}
28
29impl<'config> PhpLexer<'config> {
30 pub fn new(config: &'config PhpLanguage) -> Self {
32 Self { config }
33 }
34
35 fn run<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<(), OakError> {
36 while state.not_at_end() {
37 if self.skip_whitespace(state) {
38 continue;
39 }
40
41 if self.lex_newline(state) {
42 continue;
43 }
44
45 if self.lex_tags(state) {
46 continue;
47 }
48
49 if self.lex_comment(state) {
50 continue;
51 }
52
53 if self.lex_string(state) {
54 continue;
55 }
56
57 if self.lex_number(state) {
58 continue;
59 }
60
61 if self.lex_identifier_or_keyword(state) {
62 continue;
63 }
64
65 if self.lex_operators_and_punctuation(state) {
66 continue;
67 }
68
69 if let Some(ch) = state.peek() {
71 let start_pos = state.get_position();
72 state.advance(ch.len_utf8());
73 state.add_token(PhpTokenType::Error, start_pos, state.get_position())
74 }
75 else {
76 break;
78 }
79 }
80
81 Ok(())
82 }
83
84 fn skip_whitespace<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
85 let start_pos = state.get_position();
86
87 while let Some(ch) = state.peek() {
88 if ch == ' ' || ch == '\t' {
89 state.advance(ch.len_utf8())
90 }
91 else {
92 break;
93 }
94 }
95
96 if state.get_position() > start_pos {
97 state.add_token(PhpTokenType::Whitespace, start_pos, state.get_position());
98 true
99 }
100 else {
101 false
102 }
103 }
104
105 fn lex_tags<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
106 let start_pos = state.get_position();
107 let rest = state.rest();
108
109 if rest.starts_with(&self.config.tag_start) {
110 state.advance(self.config.tag_start.len());
111 state.add_token(PhpTokenType::OpenTag, start_pos, state.get_position());
112 return true;
113 }
114
115 if rest.starts_with(&self.config.echo_tag_start) {
116 state.advance(self.config.echo_tag_start.len());
117 state.add_token(PhpTokenType::EchoTag, start_pos, state.get_position());
118 return true;
119 }
120
121 if rest.starts_with(&self.config.short_tag_start) {
122 state.advance(self.config.short_tag_start.len());
123 state.add_token(PhpTokenType::OpenTag, start_pos, state.get_position());
124 return true;
125 }
126
127 if rest.starts_with(&self.config.tag_end) {
128 state.advance(self.config.tag_end.len());
129 state.add_token(PhpTokenType::CloseTag, start_pos, state.get_position());
130 return true;
131 }
132
133 false
134 }
135
136 fn lex_newline<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
137 let start_pos = state.get_position();
138
139 if let Some('\n') = state.peek() {
140 state.advance(1);
141 state.add_token(PhpTokenType::Newline, start_pos, state.get_position());
142 true
143 }
144 else if let Some('\r') = state.peek() {
145 state.advance(1);
146 if let Some('\n') = state.peek() {
147 state.advance(1)
148 }
149 state.add_token(PhpTokenType::Newline, start_pos, state.get_position());
150 true
151 }
152 else {
153 false
154 }
155 }
156
157 fn lex_comment<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
158 let start_pos = state.get_position();
159
160 if let Some('/') = state.peek() {
161 state.advance(1);
162 if let Some('/') = state.peek() {
163 state.advance(1);
164 while let Some(ch) = state.peek() {
166 if ch == '\n' || ch == '\r' {
167 break;
168 }
169 state.advance(ch.len_utf8())
170 }
171 state.add_token(PhpTokenType::Comment, start_pos, state.get_position());
172 return true;
173 }
174 else if let Some('*') = state.peek() {
175 state.advance(1);
176 while let Some(ch) = state.peek() {
178 if ch == '*' {
179 state.advance(1);
180 if let Some('/') = state.peek() {
181 state.advance(1);
182 break;
183 }
184 }
185 else {
186 state.advance(ch.len_utf8())
187 }
188 }
189 state.add_token(PhpTokenType::Comment, start_pos, state.get_position());
190 return true;
191 }
192 else {
193 state.set_position(start_pos);
195 return false;
196 }
197 }
198 else if let Some('#') = state.peek() {
199 state.advance(1);
200 while let Some(ch) = state.peek() {
202 if ch == '\n' || ch == '\r' {
203 break;
204 }
205 state.advance(ch.len_utf8())
206 }
207 state.add_token(PhpTokenType::Comment, start_pos, state.get_position());
208 true
209 }
210 else {
211 false
212 }
213 }
214
215 fn lex_string<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
216 let start_pos = state.get_position();
217
218 if let Some(quote_char) = state.peek() {
219 if quote_char == '"' || quote_char == '\'' {
220 state.advance(1); let mut escaped = false;
223 while let Some(ch) = state.peek() {
224 if escaped {
225 escaped = false;
226 state.advance(ch.len_utf8())
227 }
228 else if ch == '\\' {
229 escaped = true;
230 state.advance(1)
231 }
232 else if ch == quote_char {
233 state.advance(1); break;
235 }
236 else if ch == '\n' || ch == '\r' {
237 break;
239 }
240 else {
241 state.advance(ch.len_utf8())
242 }
243 }
244
245 state.add_token(PhpTokenType::StringLiteral, start_pos, state.get_position());
246 true
247 }
248 else {
249 false
250 }
251 }
252 else {
253 false
254 }
255 }
256
257 fn lex_number<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
258 if let Some(ch) = state.peek() {
259 if ch.is_ascii_digit() {
260 let start_pos = state.get_position();
261
262 while let Some(ch) = state.peek() {
264 if ch.is_ascii_digit() {
265 state.advance(1)
266 }
267 else {
268 break;
269 }
270 }
271
272 if let Some('.') = state.peek() {
274 state.advance(1);
275 while let Some(ch) = state.peek() {
277 if ch.is_ascii_digit() {
278 state.advance(1)
279 }
280 else {
281 break;
282 }
283 }
284 }
285
286 if let Some(ch) = state.peek() {
288 if ch == 'e' || ch == 'E' {
289 state.advance(1);
290 if let Some(ch) = state.peek() {
291 if ch == '+' || ch == '-' {
292 state.advance(1)
293 }
294 }
295 while let Some(ch) = state.peek() {
296 if ch.is_ascii_digit() {
297 state.advance(1)
298 }
299 else {
300 break;
301 }
302 }
303 }
304 }
305
306 state.add_token(PhpTokenType::NumberLiteral, start_pos, state.get_position());
307 true
308 }
309 else {
310 false
311 }
312 }
313 else {
314 false
315 }
316 }
317
318 fn lex_identifier_or_keyword<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
319 if let Some(ch) = state.peek() {
320 if ch.is_alphabetic() || ch == '_' || ch == '$' {
321 let start_pos = state.get_position();
322
323 while let Some(ch) = state.peek() {
325 if ch.is_alphanumeric() || ch == '_' || ch == '$' {
326 state.advance(ch.len_utf8())
327 }
328 else {
329 break;
330 }
331 }
332
333 let end_pos = state.get_position();
334 let text = state.source().get_text_in(oak_core::Range { start: start_pos, end: end_pos });
335
336 let kind = match text.as_ref() {
338 "abstract" => PhpTokenType::Abstract,
339 "and" => PhpTokenType::And,
340 "array" => PhpTokenType::Array,
341 "as" => PhpTokenType::As,
342 "break" => PhpTokenType::Break,
343 "callable" => PhpTokenType::Callable,
344 "case" => PhpTokenType::Case,
345 "catch" => PhpTokenType::Catch,
346 "class" => PhpTokenType::Class,
347 "clone" => PhpTokenType::Clone,
348 "const" => PhpTokenType::Const,
349 "continue" => PhpTokenType::Continue,
350 "declare" => PhpTokenType::Declare,
351 "default" => PhpTokenType::Default,
352 "die" => PhpTokenType::Exit,
353 "do" => PhpTokenType::Do,
354 "echo" => PhpTokenType::Echo,
355 "else" => PhpTokenType::Else,
356 "elseif" => PhpTokenType::Elseif,
357 "empty" => PhpTokenType::Empty,
358 "enddeclare" => PhpTokenType::Enddeclare,
359 "endfor" => PhpTokenType::Endfor,
360 "endforeach" => PhpTokenType::Endforeach,
361 "endif" => PhpTokenType::Endif,
362 "endswitch" => PhpTokenType::Endswitch,
363 "endwhile" => PhpTokenType::Endwhile,
364 "eval" => PhpTokenType::Eval,
365 "exit" => PhpTokenType::Exit,
366 "extends" => PhpTokenType::Extends,
367 "final" => PhpTokenType::Final,
368 "finally" => PhpTokenType::Finally,
369 "for" => PhpTokenType::For,
370 "foreach" => PhpTokenType::Foreach,
371 "function" => PhpTokenType::Function,
372 "global" => PhpTokenType::Global,
373 "goto" => PhpTokenType::Goto,
374 "if" => PhpTokenType::If,
375 "implements" => PhpTokenType::Implements,
376 "include" => PhpTokenType::Include,
377 "include_once" => PhpTokenType::IncludeOnce,
378 "instanceof" => PhpTokenType::Instanceof,
379 "insteadof" => PhpTokenType::Insteadof,
380 "interface" => PhpTokenType::Interface,
381 "isset" => PhpTokenType::Isset,
382 "list" => PhpTokenType::List,
383 "namespace" => PhpTokenType::Namespace,
384 "new" => PhpTokenType::New,
385 "or" => PhpTokenType::Or,
386 "print" => PhpTokenType::Print,
387 "private" => PhpTokenType::Private,
388 "protected" => PhpTokenType::Protected,
389 "public" => PhpTokenType::Public,
390 "require" => PhpTokenType::Require,
391 "require_once" => PhpTokenType::RequireOnce,
392 "return" => PhpTokenType::Return,
393 "static" => PhpTokenType::Static,
394 "switch" => PhpTokenType::Switch,
395 "throw" => PhpTokenType::Throw,
396 "trait" => PhpTokenType::Trait,
397 "try" => PhpTokenType::Try,
398 "unset" => PhpTokenType::Unset,
399 "use" => PhpTokenType::Use,
400 "var" => PhpTokenType::Var,
401 "while" => PhpTokenType::While,
402 "xor" => PhpTokenType::Xor,
403 "yield" => PhpTokenType::Yield,
404 "true" => PhpTokenType::BooleanLiteral,
405 "false" => PhpTokenType::BooleanLiteral,
406 "null" => PhpTokenType::NullLiteral,
407 _ => PhpTokenType::Identifier,
408 };
409
410 state.add_token(kind, start_pos, state.get_position());
411 true
412 }
413 else {
414 false
415 }
416 }
417 else {
418 false
419 }
420 }
421
422 fn lex_operators_and_punctuation<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
423 if let Some(ch) = state.peek() {
424 let start_pos = state.get_position();
425
426 let kind = match ch {
427 '+' => {
428 state.advance(1);
429 if let Some('+') = state.peek() {
430 state.advance(1);
431 PhpTokenType::Increment
432 }
433 else if let Some('=') = state.peek() {
434 state.advance(1);
435 PhpTokenType::PlusAssign
436 }
437 else {
438 PhpTokenType::Plus
439 }
440 }
441 '-' => {
442 state.advance(1);
443 if let Some('-') = state.peek() {
444 state.advance(1);
445 PhpTokenType::Decrement
446 }
447 else if let Some('=') = state.peek() {
448 state.advance(1);
449 PhpTokenType::MinusAssign
450 }
451 else if let Some('>') = state.peek() {
452 state.advance(1);
453 PhpTokenType::Arrow
454 }
455 else {
456 PhpTokenType::Minus
457 }
458 }
459 '*' => {
460 state.advance(1);
461 if let Some('*') = state.peek() {
462 state.advance(1);
463 PhpTokenType::Power
464 }
465 else if let Some('=') = state.peek() {
466 state.advance(1);
467 PhpTokenType::MultiplyAssign
468 }
469 else {
470 PhpTokenType::Multiply
471 }
472 }
473 '/' => {
474 state.advance(1);
475 if let Some('=') = state.peek() {
476 state.advance(1);
477 PhpTokenType::DivideAssign
478 }
479 else {
480 PhpTokenType::Divide
481 }
482 }
483 '%' => {
484 state.advance(1);
485 if let Some('=') = state.peek() {
486 state.advance(1);
487 PhpTokenType::ModuloAssign
488 }
489 else {
490 PhpTokenType::Modulo
491 }
492 }
493 '=' => {
494 state.advance(1);
495 if let Some('=') = state.peek() {
496 state.advance(1);
497 if let Some('=') = state.peek() {
498 state.advance(1);
499 PhpTokenType::Identical
500 }
501 else {
502 PhpTokenType::Equal
503 }
504 }
505 else if let Some('>') = state.peek() {
506 state.advance(1);
507 PhpTokenType::DoubleArrow
508 }
509 else {
510 PhpTokenType::Assign
511 }
512 }
513 '!' => {
514 state.advance(1);
515 if let Some('=') = state.peek() {
516 state.advance(1);
517 if let Some('=') = state.peek() {
518 state.advance(1);
519 PhpTokenType::NotIdentical
520 }
521 else {
522 PhpTokenType::NotEqual
523 }
524 }
525 else {
526 PhpTokenType::LogicalNot
527 }
528 }
529 '<' => {
530 state.advance(1);
531 if let Some('=') = state.peek() {
532 state.advance(1);
533 PhpTokenType::LessEqual
534 }
535 else if let Some('<') = state.peek() {
536 state.advance(1);
537 if let Some('=') = state.peek() {
538 state.advance(1);
539 PhpTokenType::LeftShiftAssign
540 }
541 else {
542 PhpTokenType::LeftShift
543 }
544 }
545 else if let Some('>') = state.peek() {
546 state.advance(1);
547 PhpTokenType::Spaceship
548 }
549 else {
550 PhpTokenType::Less
551 }
552 }
553 '>' => {
554 state.advance(1);
555 if let Some('=') = state.peek() {
556 state.advance(1);
557 PhpTokenType::GreaterEqual
558 }
559 else if let Some('>') = state.peek() {
560 state.advance(1);
561 if let Some('=') = state.peek() {
562 state.advance(1);
563 PhpTokenType::RightShiftAssign
564 }
565 else {
566 PhpTokenType::RightShift
567 }
568 }
569 else {
570 PhpTokenType::Greater
571 }
572 }
573 '&' => {
574 state.advance(1);
575 if let Some('&') = state.peek() {
576 state.advance(1);
577 PhpTokenType::LogicalAnd
578 }
579 else if let Some('=') = state.peek() {
580 state.advance(1);
581 PhpTokenType::BitwiseAndAssign
582 }
583 else {
584 PhpTokenType::BitwiseAnd
585 }
586 }
587 '|' => {
588 state.advance(1);
589 if let Some('|') = state.peek() {
590 state.advance(1);
591 PhpTokenType::LogicalOr
592 }
593 else if let Some('=') = state.peek() {
594 state.advance(1);
595 PhpTokenType::BitwiseOrAssign
596 }
597 else {
598 PhpTokenType::BitwiseOr
599 }
600 }
601 '^' => {
602 state.advance(1);
603 if let Some('=') = state.peek() {
604 state.advance(1);
605 PhpTokenType::BitwiseXorAssign
606 }
607 else {
608 PhpTokenType::BitwiseXor
609 }
610 }
611 '~' => {
612 state.advance(1);
613 PhpTokenType::BitwiseNot
614 }
615 '?' => {
616 state.advance(1);
617 if let Some('?') = state.peek() {
618 state.advance(1);
619 PhpTokenType::NullCoalesce
620 }
621 else {
622 PhpTokenType::Question
623 }
624 }
625 ':' => {
626 state.advance(1);
627 if let Some(':') = state.peek() {
628 state.advance(1);
629 PhpTokenType::DoubleColon
630 }
631 else {
632 PhpTokenType::Colon
633 }
634 }
635 ';' => {
636 state.advance(1);
637 PhpTokenType::Semicolon
638 }
639 ',' => {
640 state.advance(1);
641 PhpTokenType::Comma
642 }
643 '.' => {
644 state.advance(1);
645 if let Some('=') = state.peek() {
646 state.advance(1);
647 PhpTokenType::ConcatAssign
648 }
649 else {
650 PhpTokenType::Dot
651 }
652 }
653 '(' => {
654 state.advance(1);
655 PhpTokenType::LeftParen
656 }
657 ')' => {
658 state.advance(1);
659 PhpTokenType::RightParen
660 }
661 '[' => {
662 state.advance(1);
663 PhpTokenType::LeftBracket
664 }
665 ']' => {
666 state.advance(1);
667 PhpTokenType::RightBracket
668 }
669 '{' => {
670 state.advance(1);
671 PhpTokenType::LeftBrace
672 }
673 '}' => {
674 state.advance(1);
675 PhpTokenType::RightBrace
676 }
677 '$' => {
678 state.advance(1);
679 PhpTokenType::Dollar
680 }
681 '@' => {
682 state.advance(1);
683 PhpTokenType::At
684 }
685 _ => return false,
686 };
687
688 state.add_token(kind, start_pos, state.get_position());
689 true
690 }
691 else {
692 false
693 }
694 }
695}