mago_type_syntax/
lexer.rs

1use mago_span::Position;
2use mago_span::Span;
3use mago_syntax_core::float_exponent;
4use mago_syntax_core::float_separator;
5use mago_syntax_core::input::Input;
6use mago_syntax_core::number_sign;
7use mago_syntax_core::part_of_identifier;
8use mago_syntax_core::start_of_binary_number;
9use mago_syntax_core::start_of_float_number;
10use mago_syntax_core::start_of_hexadecimal_number;
11use mago_syntax_core::start_of_identifier;
12use mago_syntax_core::start_of_number;
13use mago_syntax_core::start_of_octal_number;
14use mago_syntax_core::start_of_octal_or_float_number;
15use mago_syntax_core::utils::read_digits_of_base;
16
17use crate::error::SyntaxError;
18use crate::token::TypeToken;
19use crate::token::TypeTokenKind;
20
/// Lexer for type-annotation syntax.
///
/// Wraps an [`Input`] and produces [`TypeToken`]s from it one at a time via
/// [`TypeLexer::advance`].
#[derive(Debug)]
pub struct TypeLexer<'input> {
    /// The byte input being tokenized; every read and consume goes through it.
    input: Input<'input>,
}
25
26impl<'input> TypeLexer<'input> {
27    pub fn new(input: Input<'input>) -> TypeLexer<'input> {
28        TypeLexer { input }
29    }
30
    /// Returns whether the underlying input reports that it has reached its end.
    ///
    /// This simply forwards to `Input::has_reached_eof`.
    pub fn has_reached_eof(&self) -> bool {
        self.input.has_reached_eof()
    }
34
    /// Returns the current position of the underlying input.
    ///
    /// This simply forwards to `Input::current_position`.
    pub fn current_position(&self) -> Position {
        self.input.current_position()
    }
38
39    #[inline]
40    pub fn advance(&mut self) -> Option<Result<TypeToken<'input>, SyntaxError>> {
41        if self.input.has_reached_eof() {
42            return None;
43        }
44
45        let start = self.input.current_position();
46        let whitespaces = self.input.consume_whitespaces();
47        if !whitespaces.is_empty() {
48            let end = self.input.current_position();
49
50            return self.token(TypeTokenKind::Whitespace, whitespaces, start, end);
51        }
52
53        let (kind, length) = match self.input.read(3) {
54            [b'n' | b'N', b'o' | b'O', b'n' | b'N'] => {
55                if self.input.is_at(b"non-empty-literal-string", true) {
56                    (TypeTokenKind::NonEmptyUnspecifiedLiteralString, 26)
57                } else if self.input.is_at(b"non-empty-string", true) {
58                    (TypeTokenKind::NonEmptyString, 16)
59                } else if self.input.is_at(b"non-empty-array", true) {
60                    (TypeTokenKind::NonEmptyArray, 15)
61                } else if self.input.is_at(b"non-empty-list", true) {
62                    (TypeTokenKind::NonEmptyList, 14)
63                } else {
64                    self.read_identifier()
65                }
66            }
67            [b'p' | b'P', b'u' | b'U', b'r' | b'R'] => {
68                if self.input.is_at(b"pure-closure", true) {
69                    (TypeTokenKind::PureClosure, 12)
70                } else if self.input.is_at(b"pure-callable", true) {
71                    (TypeTokenKind::PureCallable, 13)
72                } else {
73                    self.read_identifier()
74                }
75            }
76            [b'n' | b'N', b'e' | b'E', b'v' | b'V'] => {
77                if self.input.is_at(b"never-return", true) {
78                    (TypeTokenKind::NeverReturn, 12)
79                } else if self.input.is_at(b"never-returns", true) {
80                    (TypeTokenKind::NeverReturns, 13)
81                } else {
82                    self.read_identifier()
83                }
84            }
85            [b't' | b'T', b'r' | b'R', b'u' | b'U'] => {
86                if self.input.is_at(b"truthy-string", true) {
87                    (TypeTokenKind::TruthyString, 13)
88                } else {
89                    self.read_identifier()
90                }
91            }
92            [b't' | b'T', b'r' | b'R', b'a' | b'A'] => {
93                if self.input.is_at(b"trait-string", true) {
94                    (TypeTokenKind::TraitString, 12)
95                } else {
96                    self.read_identifier()
97                }
98            }
99            [b'a' | b'A', b's' | b'S', b's' | b'S'] => {
100                if self.input.is_at(b"associative-array", true) {
101                    (TypeTokenKind::AssociativeArray, 17)
102                } else {
103                    self.read_identifier()
104                }
105            }
106            [b'c' | b'C', b'l' | b'L', b'a' | b'A'] => {
107                if self.input.is_at(b"class-string", true) {
108                    (TypeTokenKind::ClassString, 12)
109                } else {
110                    self.read_identifier()
111                }
112            }
113            [b'e' | b'E', b'n' | b'N', b'u' | b'U'] => {
114                if self.input.is_at(b"enum-string", true) {
115                    (TypeTokenKind::EnumString, 11)
116                } else {
117                    self.read_identifier()
118                }
119            }
120            [b'i' | b'I', b'n' | b'N', b't' | b'T'] => {
121                if self.input.is_at(b"interface-string", true) {
122                    (TypeTokenKind::InterfaceString, 16)
123                } else {
124                    self.read_identifier()
125                }
126            }
127            [b'c' | b'C', b'l' | b'L', b'o' | b'O'] => {
128                if self.input.is_at(b"closed-resource", true) {
129                    (TypeTokenKind::ClosedResource, 15)
130                } else {
131                    self.read_identifier()
132                }
133            }
134            [b's' | b'S', b't' | b'T', b'r' | b'R'] => {
135                if self.input.is_at(b"stringable-object", true) {
136                    (TypeTokenKind::StringableObject, 17)
137                } else {
138                    self.read_identifier()
139                }
140            }
141            [b'n' | b'N', b'u' | b'U', b'm' | b'M'] => {
142                if self.input.is_at(b"numeric-string", true) {
143                    (TypeTokenKind::NumericString, 14)
144                } else {
145                    self.read_identifier()
146                }
147            }
148            [b'l' | b'L', b'i' | b'I', b't' | b'T'] => {
149                if self.input.is_at(b"literal-string", true) {
150                    (TypeTokenKind::UnspecifiedLiteralString, 14)
151                } else if self.input.is_at(b"literal-int", true) {
152                    (TypeTokenKind::UnspecifiedLiteralInt, 11)
153                } else {
154                    self.read_identifier()
155                }
156            }
157            [b'l' | b'L', b'o' | b'O', b'w' | b'W'] => {
158                if self.input.is_at(b"lowercase-string", true) {
159                    (TypeTokenKind::LowercaseString, 15)
160                } else {
161                    self.read_identifier()
162                }
163            }
164            [b'o' | b'O', b'p' | b'P', b'e' | b'E'] => {
165                if self.input.is_at(b"open-resource", true) {
166                    (TypeTokenKind::OpenResource, 13)
167                } else {
168                    self.read_identifier()
169                }
170            }
171            [b'a' | b'A', b'r' | b'R', b'r' | b'R'] => {
172                if self.input.is_at(b"array-key", true) {
173                    (TypeTokenKind::ArrayKey, 9)
174                } else {
175                    self.read_identifier()
176                }
177            }
178            [b'n' | b'N', b'o' | b'O', b'-'] => {
179                if self.input.is_at(b"no-return", true) {
180                    (TypeTokenKind::NoReturn, 9)
181                } else {
182                    self.read_identifier()
183                }
184            }
185            [b'v' | b'V', b'a' | b'A', b'l' | b'L'] => {
186                if self.input.is_at(b"value-of", true) {
187                    (TypeTokenKind::ValueOf, 8)
188                } else {
189                    self.read_identifier()
190                }
191            }
192            [b'k' | b'K', b'e' | b'E', b'y' | b'Y'] => {
193                if self.input.is_at(b"key-of", true) {
194                    (TypeTokenKind::KeyOf, 6)
195                } else {
196                    self.read_identifier()
197                }
198            }
199            [b'p' | b'P', b'r' | b'R', b'o' | b'O'] => {
200                if self.input.is_at(b"protected-properties-of", true) {
201                    (TypeTokenKind::ProtectedPropertiesOf, 23)
202                } else if self.input.is_at(b"properties-of", true) {
203                    (TypeTokenKind::PropertiesOf, 13)
204                } else {
205                    self.read_identifier()
206                }
207            }
208            [b'p' | b'P', b'u' | b'U', b'b' | b'B'] => {
209                if self.input.is_at(b"public-properties-of", true) {
210                    (TypeTokenKind::PublicPropertiesOf, 20)
211                } else {
212                    self.read_identifier()
213                }
214            }
215            [b'p' | b'P', b'r' | b'R', b'i' | b'I'] => {
216                if self.input.is_at(b"private-properties-of", true) {
217                    (TypeTokenKind::PrivatePropertiesOf, 21)
218                } else {
219                    self.read_identifier()
220                }
221            }
222            [b'p' | b'P', b'o' | b'O', b's' | b'S'] => {
223                if self.input.is_at(b"positive-int", true) {
224                    (TypeTokenKind::PositiveInt, 12)
225                } else {
226                    self.read_identifier()
227                }
228            }
229            [b'n' | b'N', b'e' | b'E', b'g' | b'G'] => {
230                if self.input.is_at(b"negative-int", true) {
231                    (TypeTokenKind::NegativeInt, 12)
232                } else {
233                    self.read_identifier()
234                }
235            }
236            [b'.', b'.', b'.'] => (TypeTokenKind::Ellipsis, 3),
237            [b':', b':', ..] => (TypeTokenKind::ColonColon, 2),
238            [b'/', b'/', ..] => self.read_single_line_comment(),
239            [b'.', start_of_number!(), ..] => self.read_decimal(),
240            [start_of_number!(), ..] => self.read_number(),
241            [quote @ b'\'' | quote @ b'"', ..] => self.read_literal_string(quote),
242            [b'\\', start_of_identifier!(), ..] => self.read_fully_qualified_identifier(),
243            [start_of_identifier!(), ..] => self.read_identifier(),
244            [b'$', start_of_identifier!(), ..] => {
245                let mut length = 2;
246                while let [part_of_identifier!(), ..] = self.input.peek(length, 1) {
247                    length += 1;
248                }
249
250                (TypeTokenKind::Variable, length)
251            }
252            [b':', ..] => (TypeTokenKind::Colon, 1),
253            [b'=', ..] => (TypeTokenKind::Equals, 1),
254            [b'?', ..] => (TypeTokenKind::Question, 1),
255            [b'&', ..] => (TypeTokenKind::Ampersand, 1),
256            [b'|', ..] => (TypeTokenKind::Pipe, 1),
257            [b'>', ..] => (TypeTokenKind::GreaterThan, 1),
258            [b'<', ..] => (TypeTokenKind::LessThan, 1),
259            [b'(', ..] => (TypeTokenKind::LeftParenthesis, 1),
260            [b')', ..] => (TypeTokenKind::RightParenthesis, 1),
261            [b'[', ..] => (TypeTokenKind::LeftBracket, 1),
262            [b']', ..] => (TypeTokenKind::RightBracket, 1),
263            [b'{', ..] => (TypeTokenKind::LeftBrace, 1),
264            [b'}', ..] => (TypeTokenKind::RightBrace, 1),
265            [b',', ..] => (TypeTokenKind::Comma, 1),
266            [b'+', ..] => (TypeTokenKind::Plus, 1),
267            [b'-', ..] => (TypeTokenKind::Minus, 1),
268            [unknown_byte, ..] => {
269                return Some(Err(SyntaxError::UnrecognizedToken(*unknown_byte, self.input.current_position())));
270            }
271            [] => {
272                unreachable!()
273            }
274        };
275
276        let buffer = self.input.consume(length);
277        let end = self.input.current_position();
278
279        self.token(kind, buffer, start, end)
280    }
281
282    fn read_single_line_comment(&self) -> (TypeTokenKind, usize) {
283        let mut length = 2;
284        loop {
285            match self.input.peek(length, 1) {
286                [b'\n', ..] | [] => {
287                    break;
288                }
289                [_, ..] => {
290                    length += 1;
291                }
292            }
293        }
294
295        (TypeTokenKind::SingleLineComment, length)
296    }
297
298    fn read_decimal(&self) -> (TypeTokenKind, usize) {
299        let mut length = read_digits_of_base(&self.input, 2, 10);
300        if let float_exponent!() = self.input.peek(length, 1) {
301            length += 1;
302            if let number_sign!() = self.input.peek(length, 1) {
303                length += 1;
304            }
305
306            length = read_digits_of_base(&self.input, length, 10);
307        }
308
309        (TypeTokenKind::LiteralFloat, length)
310    }
311
312    fn read_number(&self) -> (TypeTokenKind, usize) {
313        #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
314        pub enum NumberKind {
315            Integer,
316            Float,
317            OctalOrFloat,
318            IntegerOrFloat,
319        }
320
321        let mut length = 1;
322
323        let (base, kind): (u8, NumberKind) = match self.input.read(3) {
324            start_of_binary_number!() => {
325                length += 1;
326
327                (2, NumberKind::Integer)
328            }
329            start_of_octal_number!() => {
330                length += 1;
331
332                (8, NumberKind::Integer)
333            }
334            start_of_hexadecimal_number!() => {
335                length += 1;
336
337                (16, NumberKind::Integer)
338            }
339            start_of_octal_or_float_number!() => (10, NumberKind::OctalOrFloat),
340            start_of_float_number!() => (10, NumberKind::Float),
341            _ => (10, NumberKind::IntegerOrFloat),
342        };
343
344        if kind != NumberKind::Float {
345            length = read_digits_of_base(&self.input, length, base);
346
347            if kind == NumberKind::Integer {
348                return (TypeTokenKind::LiteralInteger, length);
349            }
350        }
351
352        let is_float = matches!(self.input.peek(length, 3), float_separator!());
353
354        if !is_float {
355            return (TypeTokenKind::LiteralInteger, length);
356        }
357
358        if let [b'.'] = self.input.peek(length, 1) {
359            length += 1;
360            length = read_digits_of_base(&self.input, length, 10);
361        }
362
363        if let float_exponent!() = self.input.peek(length, 1) {
364            length += 1;
365            if let number_sign!() = self.input.peek(length, 1) {
366                length += 1;
367            }
368
369            length = read_digits_of_base(&self.input, length, 10);
370        }
371
372        (TypeTokenKind::LiteralFloat, length)
373    }
374
375    fn read_literal_string(&self, quote: &u8) -> (TypeTokenKind, usize) {
376        let total = self.input.len();
377        let start = self.input.current_offset();
378        let mut length = 1; // We assume the opening quote is already consumed.
379        let mut last_was_backslash = false;
380        let mut partial = false;
381
382        loop {
383            let pos = start + length;
384            if pos >= total {
385                // Reached EOF before closing quote.
386                partial = true;
387                break;
388            }
389
390            let byte = self.input.read_at(pos);
391            if matches!(byte, b'\\') {
392                // Toggle the backslash flag.
393                last_was_backslash = !last_was_backslash;
394                length += 1;
395            } else {
396                // If we see the closing quote and the previous byte was not an escape.
397                if byte == quote && !last_was_backslash {
398                    length += 1; // Include the closing quote.
399                    break;
400                }
401
402                length += 1;
403                last_was_backslash = false;
404            }
405        }
406
407        if partial { (TypeTokenKind::PartialLiteralString, length) } else { (TypeTokenKind::LiteralString, length) }
408    }
409
410    fn read_fully_qualified_identifier(&self) -> (TypeTokenKind, usize) {
411        let mut length = 2;
412        let mut last_was_slash = false;
413        loop {
414            match self.input.peek(length, 1) {
415                [start_of_identifier!(), ..] if last_was_slash => {
416                    length += 1;
417                    last_was_slash = false;
418                }
419                [part_of_identifier!(), ..] if !last_was_slash => {
420                    length += 1;
421                }
422                [b'\\', ..] => {
423                    if last_was_slash {
424                        length -= 1;
425
426                        break;
427                    }
428
429                    length += 1;
430                    last_was_slash = true;
431                }
432                _ => {
433                    break;
434                }
435            }
436        }
437
438        (TypeTokenKind::FullyQualifiedIdentifier, length)
439    }
440
441    fn read_identifier(&self) -> (TypeTokenKind, usize) {
442        const KEYWORD_TYPES: [(&[u8], TypeTokenKind); 24] = [
443            (b"list", TypeTokenKind::List),
444            (b"int", TypeTokenKind::Int),
445            (b"string", TypeTokenKind::String),
446            (b"float", TypeTokenKind::Float),
447            (b"bool", TypeTokenKind::Bool),
448            (b"false", TypeTokenKind::False),
449            (b"true", TypeTokenKind::True),
450            (b"object", TypeTokenKind::Object),
451            (b"callable", TypeTokenKind::Callable),
452            (b"array", TypeTokenKind::Array),
453            (b"iterable", TypeTokenKind::Iterable),
454            (b"null", TypeTokenKind::Null),
455            (b"mixed", TypeTokenKind::Mixed),
456            (b"resource", TypeTokenKind::Resource),
457            (b"void", TypeTokenKind::Void),
458            (b"scalar", TypeTokenKind::Scalar),
459            (b"numeric", TypeTokenKind::Numeric),
460            (b"never", TypeTokenKind::Never),
461            (b"nothing", TypeTokenKind::Nothing),
462            (b"as", TypeTokenKind::As),
463            (b"is", TypeTokenKind::Is),
464            (b"not", TypeTokenKind::Not),
465            (b"min", TypeTokenKind::Min),
466            (b"max", TypeTokenKind::Max),
467        ];
468
469        let mut length = 1;
470        let mut ended_with_slash = false;
471        loop {
472            match self.input.peek(length, 2) {
473                [part_of_identifier!(), ..] => {
474                    length += 1;
475                }
476                [b'\\', start_of_identifier!(), ..] => {
477                    ended_with_slash = true;
478                    break;
479                }
480                _ => {
481                    break;
482                }
483            }
484        }
485
486        if !ended_with_slash {
487            for (value, kind) in KEYWORD_TYPES {
488                let keyword_length = value.len();
489                if keyword_length != length {
490                    continue;
491                }
492
493                if self.input.is_at(value, true) {
494                    return (kind, keyword_length);
495                }
496            }
497        }
498
499        let mut slashes = 0;
500        let mut last_was_slash = false;
501        loop {
502            match self.input.peek(length, 1) {
503                [start_of_identifier!(), ..] if last_was_slash => {
504                    length += 1;
505                    last_was_slash = false;
506                }
507                [part_of_identifier!(), ..] if !last_was_slash => {
508                    length += 1;
509                }
510                [b'\\', ..] => {
511                    if !last_was_slash {
512                        length += 1;
513                        slashes += 1;
514                        last_was_slash = true;
515                    } else {
516                        length -= 1;
517                        slashes -= 1;
518                        last_was_slash = false;
519
520                        break;
521                    }
522                }
523                _ => {
524                    break;
525                }
526            }
527        }
528
529        if last_was_slash {
530            length -= 1;
531            slashes -= 1;
532        }
533
534        if slashes > 0 { (TypeTokenKind::QualifiedIdentifier, length) } else { (TypeTokenKind::Identifier, length) }
535    }
536
537    #[inline]
538    fn token(
539        &self,
540        kind: TypeTokenKind,
541        value: &'input [u8],
542        from: Position,
543        to: Position,
544    ) -> Option<Result<TypeToken<'input>, SyntaxError>> {
545        let mut value_chunks = value.utf8_chunks();
546        let value_str = if let Some(chunk) = value_chunks.next() {
547            let valid = chunk.valid();
548
549            debug_assert_eq!(valid.len(), value.len());
550
551            valid
552        } else {
553            ""
554        };
555
556        Some(Ok(TypeToken { kind, value: value_str, span: Span::new(from, to) }))
557    }
558}