mago_type_syntax/
lexer.rs

1use mago_span::Position;
2use mago_span::Span;
3use mago_syntax_core::float_exponent;
4use mago_syntax_core::float_separator;
5use mago_syntax_core::input::Input;
6use mago_syntax_core::number_sign;
7use mago_syntax_core::part_of_identifier;
8use mago_syntax_core::start_of_binary_number;
9use mago_syntax_core::start_of_float_number;
10use mago_syntax_core::start_of_hexadecimal_number;
11use mago_syntax_core::start_of_identifier;
12use mago_syntax_core::start_of_number;
13use mago_syntax_core::start_of_octal_number;
14use mago_syntax_core::start_of_octal_or_float_number;
15use mago_syntax_core::utils::read_digits_of_base;
16
17use crate::error::SyntaxError;
18use crate::token::TypeToken;
19use crate::token::TypeTokenKind;
20
21#[derive(Debug)]
22pub struct TypeLexer<'input> {
23    input: Input<'input>,
24}
25
26impl<'input> TypeLexer<'input> {
27    pub fn new(input: Input<'input>) -> TypeLexer<'input> {
28        TypeLexer { input }
29    }
30
31    pub fn has_reached_eof(&self) -> bool {
32        self.input.has_reached_eof()
33    }
34
35    pub fn current_position(&self) -> Position {
36        self.input.current_position()
37    }
38
39    /// Returns a string slice within a specified absolute range.
40    ///
41    /// This method exposes the underlying `Input::slice_in_range` functionality but
42    /// returns a `&str` instead of a `&[u8]`. It assumes the source is valid UTF-8.
43    ///
44    /// # Arguments
45    ///
46    /// * `from` - The absolute starting byte offset.
47    /// * `to` - The absolute ending byte offset (exclusive).
48    #[inline]
49    pub fn slice_in_range(&self, from: usize, to: usize) -> &'input str {
50        let bytes_slice = self.input.slice_in_range(from, to);
51
52        // Reuse the same safe UTF-8 conversion logic as the `token` method.
53        bytes_slice.utf8_chunks().next().map_or("", |chunk| chunk.valid())
54    }
55
56    #[inline]
57    pub fn advance(&mut self) -> Option<Result<TypeToken<'input>, SyntaxError>> {
58        if self.input.has_reached_eof() {
59            return None;
60        }
61
62        let start = self.input.current_position();
63        let whitespaces = self.input.consume_whitespaces();
64        if !whitespaces.is_empty() {
65            let end = self.input.current_position();
66
67            return self.token(TypeTokenKind::Whitespace, whitespaces, start, end);
68        }
69
70        let (kind, length) = match self.input.read(3) {
71            [b'*', ..] => (TypeTokenKind::Asterisk, 1),
72            [b'n' | b'N', b'o' | b'O', b'n' | b'N'] => {
73                if self.input.is_at(b"non-positive-int", true) {
74                    (TypeTokenKind::NonPositiveInt, 16)
75                } else if self.input.is_at(b"non-negative-int", true) {
76                    (TypeTokenKind::NonNegativeInt, 16)
77                } else if self.input.is_at(b"non-empty-literal-string", true) {
78                    (TypeTokenKind::NonEmptyUnspecifiedLiteralString, 26)
79                } else if self.input.is_at(b"non-empty-string", true) {
80                    (TypeTokenKind::NonEmptyString, 16)
81                } else if self.input.is_at(b"non-empty-array", true) {
82                    (TypeTokenKind::NonEmptyArray, 15)
83                } else if self.input.is_at(b"non-empty-list", true) {
84                    (TypeTokenKind::NonEmptyList, 14)
85                } else if self.input.is_at(b"non-falsy-string", true) {
86                    (TypeTokenKind::NonFalsyString, 16)
87                } else if self.input.is_at(b"non-empty-lowercase-string", true) {
88                    (TypeTokenKind::NonEmptyLowercaseString, 26)
89                } else {
90                    self.read_identifier()
91                }
92            }
93            [b'p' | b'P', b'u' | b'U', b'r' | b'R'] => {
94                if self.input.is_at(b"pure-closure", true) {
95                    (TypeTokenKind::PureClosure, 12)
96                } else if self.input.is_at(b"pure-callable", true) {
97                    (TypeTokenKind::PureCallable, 13)
98                } else {
99                    self.read_identifier()
100                }
101            }
102            [b'n' | b'N', b'e' | b'E', b'v' | b'V'] => {
103                if self.input.is_at(b"never-return", true) {
104                    (TypeTokenKind::NeverReturn, 12)
105                } else if self.input.is_at(b"never-returns", true) {
106                    (TypeTokenKind::NeverReturns, 13)
107                } else {
108                    self.read_identifier()
109                }
110            }
111            [b't' | b'T', b'r' | b'R', b'u' | b'U'] => {
112                if self.input.is_at(b"truthy-string", true) {
113                    (TypeTokenKind::TruthyString, 13)
114                } else {
115                    self.read_identifier()
116                }
117            }
118            [b't' | b'T', b'r' | b'R', b'a' | b'A'] => {
119                if self.input.is_at(b"trait-string", true) {
120                    (TypeTokenKind::TraitString, 12)
121                } else {
122                    self.read_identifier()
123                }
124            }
125            [b'a' | b'A', b's' | b'S', b's' | b'S'] => {
126                if self.input.is_at(b"associative-array", true) {
127                    (TypeTokenKind::AssociativeArray, 17)
128                } else {
129                    self.read_identifier()
130                }
131            }
132            [b'c' | b'C', b'l' | b'L', b'a' | b'A'] => {
133                if self.input.is_at(b"class-string", true) {
134                    (TypeTokenKind::ClassString, 12)
135                } else {
136                    self.read_identifier()
137                }
138            }
139            [b'e' | b'E', b'n' | b'N', b'u' | b'U'] => {
140                if self.input.is_at(b"enum-string", true) {
141                    (TypeTokenKind::EnumString, 11)
142                } else {
143                    self.read_identifier()
144                }
145            }
146            [b'i' | b'I', b'n' | b'N', b't' | b'T'] => {
147                if self.input.is_at(b"interface-string", true) {
148                    (TypeTokenKind::InterfaceString, 16)
149                } else {
150                    self.read_identifier()
151                }
152            }
153            [b'c' | b'C', b'l' | b'L', b'o' | b'O'] => {
154                if self.input.is_at(b"closed-resource", true) {
155                    (TypeTokenKind::ClosedResource, 15)
156                } else {
157                    self.read_identifier()
158                }
159            }
160            [b's' | b'S', b't' | b'T', b'r' | b'R'] => {
161                if self.input.is_at(b"stringable-object", true) {
162                    (TypeTokenKind::StringableObject, 17)
163                } else {
164                    self.read_identifier()
165                }
166            }
167            [b'n' | b'N', b'u' | b'U', b'm' | b'M'] => {
168                if self.input.is_at(b"numeric-string", true) {
169                    (TypeTokenKind::NumericString, 14)
170                } else {
171                    self.read_identifier()
172                }
173            }
174            [b'l' | b'L', b'i' | b'I', b't' | b'T'] => {
175                if self.input.is_at(b"literal-string", true) {
176                    (TypeTokenKind::UnspecifiedLiteralString, 14)
177                } else if self.input.is_at(b"literal-int", true) {
178                    (TypeTokenKind::UnspecifiedLiteralInt, 11)
179                } else {
180                    self.read_identifier()
181                }
182            }
183            [b'l' | b'L', b'o' | b'O', b'w' | b'W'] => {
184                if self.input.is_at(b"lowercase-string", true) {
185                    (TypeTokenKind::LowercaseString, 16)
186                } else {
187                    self.read_identifier()
188                }
189            }
190            [b'o' | b'O', b'p' | b'P', b'e' | b'E'] => {
191                if self.input.is_at(b"open-resource", true) {
192                    (TypeTokenKind::OpenResource, 13)
193                } else {
194                    self.read_identifier()
195                }
196            }
197            [b'a' | b'A', b'r' | b'R', b'r' | b'R'] => {
198                if self.input.is_at(b"array-key", true) {
199                    (TypeTokenKind::ArrayKey, 9)
200                } else {
201                    self.read_identifier()
202                }
203            }
204            [b'n' | b'N', b'o' | b'O', b'-'] => {
205                if self.input.is_at(b"no-return", true) {
206                    (TypeTokenKind::NoReturn, 9)
207                } else {
208                    self.read_identifier()
209                }
210            }
211            [b'v' | b'V', b'a' | b'A', b'l' | b'L'] => {
212                if self.input.is_at(b"value-of", true) {
213                    (TypeTokenKind::ValueOf, 8)
214                } else {
215                    self.read_identifier()
216                }
217            }
218            [b'k' | b'K', b'e' | b'E', b'y' | b'Y'] => {
219                if self.input.is_at(b"key-of", true) {
220                    (TypeTokenKind::KeyOf, 6)
221                } else {
222                    self.read_identifier()
223                }
224            }
225            [b'p' | b'P', b'r' | b'R', b'o' | b'O'] => {
226                if self.input.is_at(b"protected-properties-of", true) {
227                    (TypeTokenKind::ProtectedPropertiesOf, 23)
228                } else if self.input.is_at(b"properties-of", true) {
229                    (TypeTokenKind::PropertiesOf, 13)
230                } else {
231                    self.read_identifier()
232                }
233            }
234            [b'p' | b'P', b'u' | b'U', b'b' | b'B'] => {
235                if self.input.is_at(b"public-properties-of", true) {
236                    (TypeTokenKind::PublicPropertiesOf, 20)
237                } else {
238                    self.read_identifier()
239                }
240            }
241            [b'p' | b'P', b'r' | b'R', b'i' | b'I'] => {
242                if self.input.is_at(b"private-properties-of", true) {
243                    (TypeTokenKind::PrivatePropertiesOf, 21)
244                } else {
245                    self.read_identifier()
246                }
247            }
248            [b'p' | b'P', b'o' | b'O', b's' | b'S'] => {
249                if self.input.is_at(b"positive-int", true) {
250                    (TypeTokenKind::PositiveInt, 12)
251                } else {
252                    self.read_identifier()
253                }
254            }
255            [b'n' | b'N', b'e' | b'E', b'g' | b'G'] => {
256                if self.input.is_at(b"negative-int", true) {
257                    (TypeTokenKind::NegativeInt, 12)
258                } else {
259                    self.read_identifier()
260                }
261            }
262            [b'.', b'.', b'.'] => (TypeTokenKind::Ellipsis, 3),
263            [b':', b':', ..] => (TypeTokenKind::ColonColon, 2),
264            [b'/', b'/', ..] => self.read_single_line_comment(),
265            [b'.', start_of_number!(), ..] => self.read_decimal(),
266            [start_of_number!(), ..] => self.read_number(),
267            [quote @ b'\'' | quote @ b'"', ..] => self.read_literal_string(quote),
268            [b'\\', start_of_identifier!(), ..] => self.read_fully_qualified_identifier(),
269            [start_of_identifier!(), ..] => self.read_identifier(),
270            [b'$', start_of_identifier!(), ..] => {
271                let mut length = 2;
272                while let [part_of_identifier!(), ..] = self.input.peek(length, 1) {
273                    length += 1;
274                }
275
276                (TypeTokenKind::Variable, length)
277            }
278            [b':', ..] => (TypeTokenKind::Colon, 1),
279            [b'=', ..] => (TypeTokenKind::Equals, 1),
280            [b'?', ..] => (TypeTokenKind::Question, 1),
281            [b'&', ..] => (TypeTokenKind::Ampersand, 1),
282            [b'|', ..] => (TypeTokenKind::Pipe, 1),
283            [b'>', ..] => (TypeTokenKind::GreaterThan, 1),
284            [b'<', ..] => (TypeTokenKind::LessThan, 1),
285            [b'(', ..] => (TypeTokenKind::LeftParenthesis, 1),
286            [b')', ..] => (TypeTokenKind::RightParenthesis, 1),
287            [b'[', ..] => (TypeTokenKind::LeftBracket, 1),
288            [b']', ..] => (TypeTokenKind::RightBracket, 1),
289            [b'{', ..] => (TypeTokenKind::LeftBrace, 1),
290            [b'}', ..] => (TypeTokenKind::RightBrace, 1),
291            [b',', ..] => (TypeTokenKind::Comma, 1),
292            [b'+', ..] => (TypeTokenKind::Plus, 1),
293            [b'-', ..] => (TypeTokenKind::Minus, 1),
294            [unknown_byte, ..] => {
295                return Some(Err(SyntaxError::UnrecognizedToken(*unknown_byte, self.input.current_position())));
296            }
297            [] => {
298                unreachable!()
299            }
300        };
301
302        let buffer = self.input.consume(length);
303        let end = self.input.current_position();
304
305        self.token(kind, buffer, start, end)
306    }
307
308    fn read_single_line_comment(&self) -> (TypeTokenKind, usize) {
309        let mut length = 2;
310        loop {
311            match self.input.peek(length, 1) {
312                [b'\n', ..] | [] => {
313                    break;
314                }
315                [_, ..] => {
316                    length += 1;
317                }
318            }
319        }
320
321        (TypeTokenKind::SingleLineComment, length)
322    }
323
324    fn read_decimal(&self) -> (TypeTokenKind, usize) {
325        let mut length = read_digits_of_base(&self.input, 2, 10);
326        if let float_exponent!() = self.input.peek(length, 1) {
327            length += 1;
328            if let number_sign!() = self.input.peek(length, 1) {
329                length += 1;
330            }
331
332            length = read_digits_of_base(&self.input, length, 10);
333        }
334
335        (TypeTokenKind::LiteralFloat, length)
336    }
337
338    fn read_number(&self) -> (TypeTokenKind, usize) {
339        #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
340        pub enum NumberKind {
341            Integer,
342            Float,
343            OctalOrFloat,
344            IntegerOrFloat,
345        }
346
347        let mut length = 1;
348
349        let (base, kind): (u8, NumberKind) = match self.input.read(3) {
350            start_of_binary_number!() => {
351                length += 1;
352
353                (2, NumberKind::Integer)
354            }
355            start_of_octal_number!() => {
356                length += 1;
357
358                (8, NumberKind::Integer)
359            }
360            start_of_hexadecimal_number!() => {
361                length += 1;
362
363                (16, NumberKind::Integer)
364            }
365            start_of_octal_or_float_number!() => (10, NumberKind::OctalOrFloat),
366            start_of_float_number!() => (10, NumberKind::Float),
367            _ => (10, NumberKind::IntegerOrFloat),
368        };
369
370        if kind != NumberKind::Float {
371            length = read_digits_of_base(&self.input, length, base);
372
373            if kind == NumberKind::Integer {
374                return (TypeTokenKind::LiteralInteger, length);
375            }
376        }
377
378        let is_float = matches!(self.input.peek(length, 3), float_separator!());
379
380        if !is_float {
381            return (TypeTokenKind::LiteralInteger, length);
382        }
383
384        if let [b'.'] = self.input.peek(length, 1) {
385            length += 1;
386            length = read_digits_of_base(&self.input, length, 10);
387        }
388
389        if let float_exponent!() = self.input.peek(length, 1) {
390            length += 1;
391            if let number_sign!() = self.input.peek(length, 1) {
392                length += 1;
393            }
394
395            length = read_digits_of_base(&self.input, length, 10);
396        }
397
398        (TypeTokenKind::LiteralFloat, length)
399    }
400
401    fn read_literal_string(&self, quote: &u8) -> (TypeTokenKind, usize) {
402        let total = self.input.len();
403        let start = self.input.current_offset();
404        let mut length = 1; // We assume the opening quote is already consumed.
405        let mut last_was_backslash = false;
406        let mut partial = false;
407
408        loop {
409            let pos = start + length;
410            if pos >= total {
411                // Reached EOF before closing quote.
412                partial = true;
413                break;
414            }
415
416            let byte = self.input.read_at(pos);
417            if matches!(byte, b'\\') {
418                // Toggle the backslash flag.
419                last_was_backslash = !last_was_backslash;
420                length += 1;
421            } else {
422                // If we see the closing quote and the previous byte was not an escape.
423                if byte == quote && !last_was_backslash {
424                    length += 1; // Include the closing quote.
425                    break;
426                }
427
428                length += 1;
429                last_was_backslash = false;
430            }
431        }
432
433        if partial { (TypeTokenKind::PartialLiteralString, length) } else { (TypeTokenKind::LiteralString, length) }
434    }
435
436    fn read_fully_qualified_identifier(&self) -> (TypeTokenKind, usize) {
437        let mut length = 2;
438        let mut last_was_slash = false;
439        loop {
440            match self.input.peek(length, 1) {
441                [start_of_identifier!(), ..] if last_was_slash => {
442                    length += 1;
443                    last_was_slash = false;
444                }
445                [part_of_identifier!(), ..] if !last_was_slash => {
446                    length += 1;
447                }
448                [b'\\', ..] => {
449                    if last_was_slash {
450                        length -= 1;
451
452                        break;
453                    }
454
455                    length += 1;
456                    last_was_slash = true;
457                }
458                _ => {
459                    break;
460                }
461            }
462        }
463
464        (TypeTokenKind::FullyQualifiedIdentifier, length)
465    }
466
467    fn read_identifier(&self) -> (TypeTokenKind, usize) {
468        const KEYWORD_TYPES: [(&[u8], TypeTokenKind); 28] = [
469            (b"list", TypeTokenKind::List),
470            (b"int", TypeTokenKind::Int),
471            (b"integer", TypeTokenKind::Integer),
472            (b"string", TypeTokenKind::String),
473            (b"float", TypeTokenKind::Float),
474            (b"double", TypeTokenKind::Double),
475            (b"real", TypeTokenKind::Real),
476            (b"bool", TypeTokenKind::Bool),
477            (b"boolean", TypeTokenKind::Boolean),
478            (b"false", TypeTokenKind::False),
479            (b"true", TypeTokenKind::True),
480            (b"object", TypeTokenKind::Object),
481            (b"callable", TypeTokenKind::Callable),
482            (b"array", TypeTokenKind::Array),
483            (b"iterable", TypeTokenKind::Iterable),
484            (b"null", TypeTokenKind::Null),
485            (b"mixed", TypeTokenKind::Mixed),
486            (b"resource", TypeTokenKind::Resource),
487            (b"void", TypeTokenKind::Void),
488            (b"scalar", TypeTokenKind::Scalar),
489            (b"numeric", TypeTokenKind::Numeric),
490            (b"never", TypeTokenKind::Never),
491            (b"nothing", TypeTokenKind::Nothing),
492            (b"as", TypeTokenKind::As),
493            (b"is", TypeTokenKind::Is),
494            (b"not", TypeTokenKind::Not),
495            (b"min", TypeTokenKind::Min),
496            (b"max", TypeTokenKind::Max),
497        ];
498
499        let mut length = 1;
500        let mut ended_with_slash = false;
501        loop {
502            match self.input.peek(length, 2) {
503                [part_of_identifier!(), ..] => {
504                    length += 1;
505                }
506                [b'\\', start_of_identifier!(), ..] => {
507                    ended_with_slash = true;
508                    break;
509                }
510                _ => {
511                    break;
512                }
513            }
514        }
515
516        if !ended_with_slash {
517            for (value, kind) in KEYWORD_TYPES {
518                let keyword_length = value.len();
519                if keyword_length != length {
520                    continue;
521                }
522
523                if self.input.is_at(value, true) {
524                    return (kind, keyword_length);
525                }
526            }
527        }
528
529        let mut slashes = 0;
530        let mut last_was_slash = false;
531        loop {
532            match self.input.peek(length, 1) {
533                [start_of_identifier!(), ..] if last_was_slash => {
534                    length += 1;
535                    last_was_slash = false;
536                }
537                [part_of_identifier!(), ..] if !last_was_slash => {
538                    length += 1;
539                }
540                [b'\\', ..] => {
541                    if !last_was_slash {
542                        length += 1;
543                        slashes += 1;
544                        last_was_slash = true;
545                    } else {
546                        length -= 1;
547                        slashes -= 1;
548                        last_was_slash = false;
549
550                        break;
551                    }
552                }
553                _ => {
554                    break;
555                }
556            }
557        }
558
559        if last_was_slash {
560            length -= 1;
561            slashes -= 1;
562        }
563
564        if slashes > 0 { (TypeTokenKind::QualifiedIdentifier, length) } else { (TypeTokenKind::Identifier, length) }
565    }
566
567    #[inline]
568    fn token(
569        &self,
570        kind: TypeTokenKind,
571        value: &'input [u8],
572        from: Position,
573        to: Position,
574    ) -> Option<Result<TypeToken<'input>, SyntaxError>> {
575        let mut value_chunks = value.utf8_chunks();
576        let value_str = if let Some(chunk) = value_chunks.next() {
577            let valid = chunk.valid();
578
579            debug_assert_eq!(valid.len(), value.len());
580
581            valid
582        } else {
583            ""
584        };
585
586        Some(Ok(TypeToken { kind, value: value_str, span: Span::new(from, to) }))
587    }
588}