1use mago_database::file::FileId;
2use mago_database::file::HasFileId;
3use mago_span::Position;
4use mago_span::Span;
5use mago_syntax_core::float_exponent;
6use mago_syntax_core::float_separator;
7use mago_syntax_core::input::Input;
8use mago_syntax_core::number_sign;
9use mago_syntax_core::part_of_identifier;
10use mago_syntax_core::start_of_binary_number;
11use mago_syntax_core::start_of_float_number;
12use mago_syntax_core::start_of_hexadecimal_number;
13use mago_syntax_core::start_of_identifier;
14use mago_syntax_core::start_of_number;
15use mago_syntax_core::start_of_octal_number;
16use mago_syntax_core::start_of_octal_or_float_number;
17use mago_syntax_core::utils::read_digits_of_base;
18
19use crate::error::SyntaxError;
20use crate::token::TypeToken;
21use crate::token::TypeTokenKind;
22
/// A lexer that splits the bytes of an [`Input`] into [`TypeToken`]s.
///
/// Tokens are produced one at a time by calling [`TypeLexer::advance`].
#[derive(Debug)]
pub struct TypeLexer<'input> {
    /// The byte input the lexer reads from and consumes as tokens are produced.
    input: Input<'input>,
}
27
28impl<'input> TypeLexer<'input> {
29 pub fn new(input: Input<'input>) -> TypeLexer<'input> {
30 TypeLexer { input }
31 }
32
    /// Reports whether the underlying input has been fully consumed.
    ///
    /// Delegates directly to the wrapped `Input`.
    pub fn has_reached_eof(&self) -> bool {
        self.input.has_reached_eof()
    }
36
    /// Returns the position the underlying input is currently at.
    ///
    /// Delegates directly to the wrapped `Input`.
    pub fn current_position(&self) -> Position {
        self.input.current_position()
    }
40
41 #[inline]
51 pub fn slice_in_range(&self, from: u32, to: u32) -> &'input str {
52 let bytes_slice = self.input.slice_in_range(from, to);
53
54 bytes_slice.utf8_chunks().next().map_or("", |chunk| chunk.valid())
56 }
57
58 #[inline]
59 pub fn advance(&mut self) -> Option<Result<TypeToken<'input>, SyntaxError>> {
60 if self.input.has_reached_eof() {
61 return None;
62 }
63
64 let start = self.input.current_position();
65 let whitespaces = self.input.consume_whitespaces();
66 if !whitespaces.is_empty() {
67 let end = self.input.current_position();
68
69 return self.token(TypeTokenKind::Whitespace, whitespaces, start, end);
70 }
71
72 let (kind, length) = match self.input.read(3) {
73 [b'*', ..] => (TypeTokenKind::Asterisk, 1),
74 [b'n' | b'N', b'o' | b'O', b'n' | b'N'] => {
75 if self.input.is_at(b"non-positive-int", true) {
76 (TypeTokenKind::NonPositiveInt, 16)
77 } else if self.input.is_at(b"non-negative-int", true) {
78 (TypeTokenKind::NonNegativeInt, 16)
79 } else if self.input.is_at(b"non-empty-literal-string", true) {
80 (TypeTokenKind::NonEmptyUnspecifiedLiteralString, 26)
81 } else if self.input.is_at(b"non-empty-string", true) {
82 (TypeTokenKind::NonEmptyString, 16)
83 } else if self.input.is_at(b"non-empty-array", true) {
84 (TypeTokenKind::NonEmptyArray, 15)
85 } else if self.input.is_at(b"non-empty-list", true) {
86 (TypeTokenKind::NonEmptyList, 14)
87 } else if self.input.is_at(b"non-falsy-string", true) {
88 (TypeTokenKind::NonFalsyString, 16)
89 } else if self.input.is_at(b"non-empty-lowercase-string", true) {
90 (TypeTokenKind::NonEmptyLowercaseString, 26)
91 } else {
92 self.read_identifier()
93 }
94 }
95 [b'p' | b'P', b'u' | b'U', b'r' | b'R'] => {
96 if self.input.is_at(b"pure-closure", true) {
97 (TypeTokenKind::PureClosure, 12)
98 } else if self.input.is_at(b"pure-callable", true) {
99 (TypeTokenKind::PureCallable, 13)
100 } else {
101 self.read_identifier()
102 }
103 }
104 [b'n' | b'N', b'e' | b'E', b'v' | b'V'] => {
105 if self.input.is_at(b"never-return", true) {
106 (TypeTokenKind::NeverReturn, 12)
107 } else if self.input.is_at(b"never-returns", true) {
108 (TypeTokenKind::NeverReturns, 13)
109 } else {
110 self.read_identifier()
111 }
112 }
113 [b't' | b'T', b'r' | b'R', b'u' | b'U'] => {
114 if self.input.is_at(b"truthy-string", true) {
115 (TypeTokenKind::TruthyString, 13)
116 } else {
117 self.read_identifier()
118 }
119 }
120 [b't' | b'T', b'r' | b'R', b'a' | b'A'] => {
121 if self.input.is_at(b"trait-string", true) {
122 (TypeTokenKind::TraitString, 12)
123 } else {
124 self.read_identifier()
125 }
126 }
127 [b'a' | b'A', b's' | b'S', b's' | b'S'] => {
128 if self.input.is_at(b"associative-array", true) {
129 (TypeTokenKind::AssociativeArray, 17)
130 } else {
131 self.read_identifier()
132 }
133 }
134 [b'c' | b'C', b'l' | b'L', b'a' | b'A'] => {
135 if self.input.is_at(b"class-string", true) {
136 (TypeTokenKind::ClassString, 12)
137 } else {
138 self.read_identifier()
139 }
140 }
141 [b'e' | b'E', b'n' | b'N', b'u' | b'U'] => {
142 if self.input.is_at(b"enum-string", true) {
143 (TypeTokenKind::EnumString, 11)
144 } else {
145 self.read_identifier()
146 }
147 }
148 [b'i' | b'I', b'n' | b'N', b't' | b'T'] => {
149 if self.input.is_at(b"interface-string", true) {
150 (TypeTokenKind::InterfaceString, 16)
151 } else {
152 self.read_identifier()
153 }
154 }
155 [b'c' | b'C', b'l' | b'L', b'o' | b'O'] => {
156 if self.input.is_at(b"closed-resource", true) {
157 (TypeTokenKind::ClosedResource, 15)
158 } else {
159 self.read_identifier()
160 }
161 }
162 [b's' | b'S', b't' | b'T', b'r' | b'R'] => {
163 if self.input.is_at(b"stringable-object", true) {
164 (TypeTokenKind::StringableObject, 17)
165 } else {
166 self.read_identifier()
167 }
168 }
169 [b'n' | b'N', b'u' | b'U', b'm' | b'M'] => {
170 if self.input.is_at(b"numeric-string", true) {
171 (TypeTokenKind::NumericString, 14)
172 } else {
173 self.read_identifier()
174 }
175 }
176 [b'l' | b'L', b'i' | b'I', b't' | b'T'] => {
177 if self.input.is_at(b"literal-string", true) {
178 (TypeTokenKind::UnspecifiedLiteralString, 14)
179 } else if self.input.is_at(b"literal-int", true) {
180 (TypeTokenKind::UnspecifiedLiteralInt, 11)
181 } else {
182 self.read_identifier()
183 }
184 }
185 [b'l' | b'L', b'o' | b'O', b'w' | b'W'] => {
186 if self.input.is_at(b"lowercase-string", true) {
187 (TypeTokenKind::LowercaseString, 16)
188 } else {
189 self.read_identifier()
190 }
191 }
192 [b'o' | b'O', b'p' | b'P', b'e' | b'E'] => {
193 if self.input.is_at(b"open-resource", true) {
194 (TypeTokenKind::OpenResource, 13)
195 } else {
196 self.read_identifier()
197 }
198 }
199 [b'a' | b'A', b'r' | b'R', b'r' | b'R'] => {
200 if self.input.is_at(b"array-key", true) {
201 (TypeTokenKind::ArrayKey, 9)
202 } else {
203 self.read_identifier()
204 }
205 }
206 [b'n' | b'N', b'o' | b'O', b'-'] => {
207 if self.input.is_at(b"no-return", true) {
208 (TypeTokenKind::NoReturn, 9)
209 } else {
210 self.read_identifier()
211 }
212 }
213 [b'v' | b'V', b'a' | b'A', b'l' | b'L'] => {
214 if self.input.is_at(b"value-of", true) {
215 (TypeTokenKind::ValueOf, 8)
216 } else {
217 self.read_identifier()
218 }
219 }
220 [b'k' | b'K', b'e' | b'E', b'y' | b'Y'] => {
221 if self.input.is_at(b"key-of", true) {
222 (TypeTokenKind::KeyOf, 6)
223 } else {
224 self.read_identifier()
225 }
226 }
227 [b'p' | b'P', b'r' | b'R', b'o' | b'O'] => {
228 if self.input.is_at(b"protected-properties-of", true) {
229 (TypeTokenKind::ProtectedPropertiesOf, 23)
230 } else if self.input.is_at(b"properties-of", true) {
231 (TypeTokenKind::PropertiesOf, 13)
232 } else {
233 self.read_identifier()
234 }
235 }
236 [b'p' | b'P', b'u' | b'U', b'b' | b'B'] => {
237 if self.input.is_at(b"public-properties-of", true) {
238 (TypeTokenKind::PublicPropertiesOf, 20)
239 } else {
240 self.read_identifier()
241 }
242 }
243 [b'p' | b'P', b'r' | b'R', b'i' | b'I'] => {
244 if self.input.is_at(b"private-properties-of", true) {
245 (TypeTokenKind::PrivatePropertiesOf, 21)
246 } else {
247 self.read_identifier()
248 }
249 }
250 [b'p' | b'P', b'o' | b'O', b's' | b'S'] => {
251 if self.input.is_at(b"positive-int", true) {
252 (TypeTokenKind::PositiveInt, 12)
253 } else {
254 self.read_identifier()
255 }
256 }
257 [b'n' | b'N', b'e' | b'E', b'g' | b'G'] => {
258 if self.input.is_at(b"negative-int", true) {
259 (TypeTokenKind::NegativeInt, 12)
260 } else {
261 self.read_identifier()
262 }
263 }
264 [b'.', b'.', b'.'] => (TypeTokenKind::Ellipsis, 3),
265 [b':', b':', ..] => (TypeTokenKind::ColonColon, 2),
266 [b'/', b'/', ..] => self.read_single_line_comment(),
267 [b'.', start_of_number!(), ..] => self.read_decimal(),
268 [start_of_number!(), ..] => self.read_number(),
269 [quote @ b'\'' | quote @ b'"', ..] => self.read_literal_string(quote),
270 [b'\\', start_of_identifier!(), ..] => self.read_fully_qualified_identifier(),
271 [start_of_identifier!(), ..] => self.read_identifier(),
272 [b'$', start_of_identifier!(), ..] => {
273 let mut length = 2;
274 while let [part_of_identifier!(), ..] = self.input.peek(length, 1) {
275 length += 1;
276 }
277
278 (TypeTokenKind::Variable, length)
279 }
280 [b':', ..] => (TypeTokenKind::Colon, 1),
281 [b'=', ..] => (TypeTokenKind::Equals, 1),
282 [b'?', ..] => (TypeTokenKind::Question, 1),
283 [b'&', ..] => (TypeTokenKind::Ampersand, 1),
284 [b'|', ..] => (TypeTokenKind::Pipe, 1),
285 [b'>', ..] => (TypeTokenKind::GreaterThan, 1),
286 [b'<', ..] => (TypeTokenKind::LessThan, 1),
287 [b'(', ..] => (TypeTokenKind::LeftParenthesis, 1),
288 [b')', ..] => (TypeTokenKind::RightParenthesis, 1),
289 [b'[', ..] => (TypeTokenKind::LeftBracket, 1),
290 [b']', ..] => (TypeTokenKind::RightBracket, 1),
291 [b'{', ..] => (TypeTokenKind::LeftBrace, 1),
292 [b'}', ..] => (TypeTokenKind::RightBrace, 1),
293 [b',', ..] => (TypeTokenKind::Comma, 1),
294 [b'+', ..] => (TypeTokenKind::Plus, 1),
295 [b'-', ..] => (TypeTokenKind::Minus, 1),
296 [unknown_byte, ..] => {
297 return Some(Err(SyntaxError::UnrecognizedToken(
298 self.file_id(),
299 *unknown_byte,
300 self.input.current_position(),
301 )));
302 }
303 [] => {
304 unreachable!()
305 }
306 };
307
308 let buffer = self.input.consume(length);
309 let end = self.input.current_position();
310
311 self.token(kind, buffer, start, end)
312 }
313
314 fn read_single_line_comment(&self) -> (TypeTokenKind, usize) {
315 let mut length = 2;
316 loop {
317 match self.input.peek(length, 1) {
318 [b'\n', ..] | [] => {
319 break;
320 }
321 [_, ..] => {
322 length += 1;
323 }
324 }
325 }
326
327 (TypeTokenKind::SingleLineComment, length)
328 }
329
330 fn read_decimal(&self) -> (TypeTokenKind, usize) {
331 let mut length = read_digits_of_base(&self.input, 2, 10);
332 if let float_exponent!() = self.input.peek(length, 1) {
333 length += 1;
334 if let number_sign!() = self.input.peek(length, 1) {
335 length += 1;
336 }
337
338 length = read_digits_of_base(&self.input, length, 10);
339 }
340
341 (TypeTokenKind::LiteralFloat, length)
342 }
343
344 fn read_number(&self) -> (TypeTokenKind, usize) {
345 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
346 pub enum NumberKind {
347 Integer,
348 Float,
349 OctalOrFloat,
350 IntegerOrFloat,
351 }
352
353 let mut length = 1;
354
355 let (base, kind): (u8, NumberKind) = match self.input.read(3) {
356 start_of_binary_number!() => {
357 length += 1;
358
359 (2, NumberKind::Integer)
360 }
361 start_of_octal_number!() => {
362 length += 1;
363
364 (8, NumberKind::Integer)
365 }
366 start_of_hexadecimal_number!() => {
367 length += 1;
368
369 (16, NumberKind::Integer)
370 }
371 start_of_octal_or_float_number!() => (10, NumberKind::OctalOrFloat),
372 start_of_float_number!() => (10, NumberKind::Float),
373 _ => (10, NumberKind::IntegerOrFloat),
374 };
375
376 if kind != NumberKind::Float {
377 length = read_digits_of_base(&self.input, length, base);
378
379 if kind == NumberKind::Integer {
380 return (TypeTokenKind::LiteralInteger, length);
381 }
382 }
383
384 let is_float = matches!(self.input.peek(length, 3), float_separator!());
385
386 if !is_float {
387 return (TypeTokenKind::LiteralInteger, length);
388 }
389
390 if let [b'.'] = self.input.peek(length, 1) {
391 length += 1;
392 length = read_digits_of_base(&self.input, length, 10);
393 }
394
395 if let float_exponent!() = self.input.peek(length, 1) {
396 length += 1;
397 if let number_sign!() = self.input.peek(length, 1) {
398 length += 1;
399 }
400
401 length = read_digits_of_base(&self.input, length, 10);
402 }
403
404 (TypeTokenKind::LiteralFloat, length)
405 }
406
407 fn read_literal_string(&self, quote: &u8) -> (TypeTokenKind, usize) {
408 let total = self.input.len();
409 let start = self.input.current_offset();
410 let mut length = 1; let mut last_was_backslash = false;
412 let mut partial = false;
413
414 loop {
415 let pos = start + length;
416 if pos >= total {
417 partial = true;
419 break;
420 }
421
422 let byte = self.input.read_at(pos);
423 if matches!(byte, b'\\') {
424 last_was_backslash = !last_was_backslash;
426 length += 1;
427 } else {
428 if byte == quote && !last_was_backslash {
430 length += 1; break;
432 }
433
434 length += 1;
435 last_was_backslash = false;
436 }
437 }
438
439 if partial { (TypeTokenKind::PartialLiteralString, length) } else { (TypeTokenKind::LiteralString, length) }
440 }
441
442 fn read_fully_qualified_identifier(&self) -> (TypeTokenKind, usize) {
443 let mut length = 2;
444 let mut last_was_slash = false;
445 loop {
446 match self.input.peek(length, 1) {
447 [start_of_identifier!(), ..] if last_was_slash => {
448 length += 1;
449 last_was_slash = false;
450 }
451 [part_of_identifier!(), ..] if !last_was_slash => {
452 length += 1;
453 }
454 [b'\\', ..] => {
455 if last_was_slash {
456 length -= 1;
457
458 break;
459 }
460
461 length += 1;
462 last_was_slash = true;
463 }
464 _ => {
465 break;
466 }
467 }
468 }
469
470 (TypeTokenKind::FullyQualifiedIdentifier, length)
471 }
472
473 fn read_identifier(&self) -> (TypeTokenKind, usize) {
474 const KEYWORD_TYPES: [(&[u8], TypeTokenKind); 28] = [
475 (b"list", TypeTokenKind::List),
476 (b"int", TypeTokenKind::Int),
477 (b"integer", TypeTokenKind::Integer),
478 (b"string", TypeTokenKind::String),
479 (b"float", TypeTokenKind::Float),
480 (b"double", TypeTokenKind::Double),
481 (b"real", TypeTokenKind::Real),
482 (b"bool", TypeTokenKind::Bool),
483 (b"boolean", TypeTokenKind::Boolean),
484 (b"false", TypeTokenKind::False),
485 (b"true", TypeTokenKind::True),
486 (b"object", TypeTokenKind::Object),
487 (b"callable", TypeTokenKind::Callable),
488 (b"array", TypeTokenKind::Array),
489 (b"iterable", TypeTokenKind::Iterable),
490 (b"null", TypeTokenKind::Null),
491 (b"mixed", TypeTokenKind::Mixed),
492 (b"resource", TypeTokenKind::Resource),
493 (b"void", TypeTokenKind::Void),
494 (b"scalar", TypeTokenKind::Scalar),
495 (b"numeric", TypeTokenKind::Numeric),
496 (b"never", TypeTokenKind::Never),
497 (b"nothing", TypeTokenKind::Nothing),
498 (b"as", TypeTokenKind::As),
499 (b"is", TypeTokenKind::Is),
500 (b"not", TypeTokenKind::Not),
501 (b"min", TypeTokenKind::Min),
502 (b"max", TypeTokenKind::Max),
503 ];
504
505 let mut length = 1;
506 let mut ended_with_slash = false;
507 loop {
508 match self.input.peek(length, 2) {
509 [part_of_identifier!(), ..] => {
510 length += 1;
511 }
512 [b'\\', start_of_identifier!(), ..] => {
513 ended_with_slash = true;
514 break;
515 }
516 _ => {
517 break;
518 }
519 }
520 }
521
522 if !ended_with_slash {
523 for (value, kind) in KEYWORD_TYPES {
524 let keyword_length = value.len();
525 if keyword_length != length {
526 continue;
527 }
528
529 if self.input.is_at(value, true) {
530 return (kind, keyword_length);
531 }
532 }
533 }
534
535 let mut slashes = 0;
536 let mut last_was_slash = false;
537 loop {
538 match self.input.peek(length, 1) {
539 [start_of_identifier!(), ..] if last_was_slash => {
540 length += 1;
541 last_was_slash = false;
542 }
543 [part_of_identifier!(), ..] if !last_was_slash => {
544 length += 1;
545 }
546 [b'\\', ..] => {
547 if !last_was_slash {
548 length += 1;
549 slashes += 1;
550 last_was_slash = true;
551 } else {
552 length -= 1;
553 slashes -= 1;
554 last_was_slash = false;
555
556 break;
557 }
558 }
559 _ => {
560 break;
561 }
562 }
563 }
564
565 if last_was_slash {
566 length -= 1;
567 slashes -= 1;
568 }
569
570 if slashes > 0 { (TypeTokenKind::QualifiedIdentifier, length) } else { (TypeTokenKind::Identifier, length) }
571 }
572
573 #[inline]
574 fn token(
575 &self,
576 kind: TypeTokenKind,
577 value: &'input [u8],
578 from: Position,
579 to: Position,
580 ) -> Option<Result<TypeToken<'input>, SyntaxError>> {
581 let mut value_chunks = value.utf8_chunks();
582 let value_str = if let Some(chunk) = value_chunks.next() {
583 let valid = chunk.valid();
584
585 debug_assert_eq!(valid.len(), value.len());
586
587 valid
588 } else {
589 ""
590 };
591
592 Some(Ok(TypeToken { kind, value: value_str, span: Span::new(self.file_id(), from, to) }))
593 }
594}
595
/// Exposes the file identifier of the input the lexer is reading.
impl HasFileId for TypeLexer<'_> {
    /// Returns the file identifier reported by the underlying input.
    fn file_id(&self) -> FileId {
        self.input.file_id()
    }
}