1use mago_span::Position;
2use mago_span::Span;
3use mago_syntax_core::float_exponent;
4use mago_syntax_core::float_separator;
5use mago_syntax_core::input::Input;
6use mago_syntax_core::number_sign;
7use mago_syntax_core::part_of_identifier;
8use mago_syntax_core::start_of_binary_number;
9use mago_syntax_core::start_of_float_number;
10use mago_syntax_core::start_of_hexadecimal_number;
11use mago_syntax_core::start_of_identifier;
12use mago_syntax_core::start_of_number;
13use mago_syntax_core::start_of_octal_number;
14use mago_syntax_core::start_of_octal_or_float_number;
15use mago_syntax_core::utils::read_digits_of_base;
16
17use crate::error::SyntaxError;
18use crate::token::TypeToken;
19use crate::token::TypeTokenKind;
20
21#[derive(Debug)]
22pub struct TypeLexer<'input> {
23 input: Input<'input>,
24}
25
26impl<'input> TypeLexer<'input> {
27 pub fn new(input: Input<'input>) -> TypeLexer<'input> {
28 TypeLexer { input }
29 }
30
31 pub fn has_reached_eof(&self) -> bool {
32 self.input.has_reached_eof()
33 }
34
35 pub fn current_position(&self) -> Position {
36 self.input.current_position()
37 }
38
39 #[inline]
49 pub fn slice_in_range(&self, from: usize, to: usize) -> &'input str {
50 let bytes_slice = self.input.slice_in_range(from, to);
51
52 bytes_slice.utf8_chunks().next().map_or("", |chunk| chunk.valid())
54 }
55
56 #[inline]
57 pub fn advance(&mut self) -> Option<Result<TypeToken<'input>, SyntaxError>> {
58 if self.input.has_reached_eof() {
59 return None;
60 }
61
62 let start = self.input.current_position();
63 let whitespaces = self.input.consume_whitespaces();
64 if !whitespaces.is_empty() {
65 let end = self.input.current_position();
66
67 return self.token(TypeTokenKind::Whitespace, whitespaces, start, end);
68 }
69
70 let (kind, length) = match self.input.read(3) {
71 [b'*', ..] => (TypeTokenKind::Asterisk, 1),
72 [b'n' | b'N', b'o' | b'O', b'n' | b'N'] => {
73 if self.input.is_at(b"non-positive-int", true) {
74 (TypeTokenKind::NonPositiveInt, 16)
75 } else if self.input.is_at(b"non-negative-int", true) {
76 (TypeTokenKind::NonNegativeInt, 16)
77 } else if self.input.is_at(b"non-empty-literal-string", true) {
78 (TypeTokenKind::NonEmptyUnspecifiedLiteralString, 26)
79 } else if self.input.is_at(b"non-empty-string", true) {
80 (TypeTokenKind::NonEmptyString, 16)
81 } else if self.input.is_at(b"non-empty-array", true) {
82 (TypeTokenKind::NonEmptyArray, 15)
83 } else if self.input.is_at(b"non-empty-list", true) {
84 (TypeTokenKind::NonEmptyList, 14)
85 } else if self.input.is_at(b"non-falsy-string", true) {
86 (TypeTokenKind::NonFalsyString, 16)
87 } else if self.input.is_at(b"non-empty-lowercase-string", true) {
88 (TypeTokenKind::NonEmptyLowercaseString, 26)
89 } else {
90 self.read_identifier()
91 }
92 }
93 [b'p' | b'P', b'u' | b'U', b'r' | b'R'] => {
94 if self.input.is_at(b"pure-closure", true) {
95 (TypeTokenKind::PureClosure, 12)
96 } else if self.input.is_at(b"pure-callable", true) {
97 (TypeTokenKind::PureCallable, 13)
98 } else {
99 self.read_identifier()
100 }
101 }
102 [b'n' | b'N', b'e' | b'E', b'v' | b'V'] => {
103 if self.input.is_at(b"never-return", true) {
104 (TypeTokenKind::NeverReturn, 12)
105 } else if self.input.is_at(b"never-returns", true) {
106 (TypeTokenKind::NeverReturns, 13)
107 } else {
108 self.read_identifier()
109 }
110 }
111 [b't' | b'T', b'r' | b'R', b'u' | b'U'] => {
112 if self.input.is_at(b"truthy-string", true) {
113 (TypeTokenKind::TruthyString, 13)
114 } else {
115 self.read_identifier()
116 }
117 }
118 [b't' | b'T', b'r' | b'R', b'a' | b'A'] => {
119 if self.input.is_at(b"trait-string", true) {
120 (TypeTokenKind::TraitString, 12)
121 } else {
122 self.read_identifier()
123 }
124 }
125 [b'a' | b'A', b's' | b'S', b's' | b'S'] => {
126 if self.input.is_at(b"associative-array", true) {
127 (TypeTokenKind::AssociativeArray, 17)
128 } else {
129 self.read_identifier()
130 }
131 }
132 [b'c' | b'C', b'l' | b'L', b'a' | b'A'] => {
133 if self.input.is_at(b"class-string", true) {
134 (TypeTokenKind::ClassString, 12)
135 } else {
136 self.read_identifier()
137 }
138 }
139 [b'e' | b'E', b'n' | b'N', b'u' | b'U'] => {
140 if self.input.is_at(b"enum-string", true) {
141 (TypeTokenKind::EnumString, 11)
142 } else {
143 self.read_identifier()
144 }
145 }
146 [b'i' | b'I', b'n' | b'N', b't' | b'T'] => {
147 if self.input.is_at(b"interface-string", true) {
148 (TypeTokenKind::InterfaceString, 16)
149 } else {
150 self.read_identifier()
151 }
152 }
153 [b'c' | b'C', b'l' | b'L', b'o' | b'O'] => {
154 if self.input.is_at(b"closed-resource", true) {
155 (TypeTokenKind::ClosedResource, 15)
156 } else {
157 self.read_identifier()
158 }
159 }
160 [b's' | b'S', b't' | b'T', b'r' | b'R'] => {
161 if self.input.is_at(b"stringable-object", true) {
162 (TypeTokenKind::StringableObject, 17)
163 } else {
164 self.read_identifier()
165 }
166 }
167 [b'n' | b'N', b'u' | b'U', b'm' | b'M'] => {
168 if self.input.is_at(b"numeric-string", true) {
169 (TypeTokenKind::NumericString, 14)
170 } else {
171 self.read_identifier()
172 }
173 }
174 [b'l' | b'L', b'i' | b'I', b't' | b'T'] => {
175 if self.input.is_at(b"literal-string", true) {
176 (TypeTokenKind::UnspecifiedLiteralString, 14)
177 } else if self.input.is_at(b"literal-int", true) {
178 (TypeTokenKind::UnspecifiedLiteralInt, 11)
179 } else {
180 self.read_identifier()
181 }
182 }
183 [b'l' | b'L', b'o' | b'O', b'w' | b'W'] => {
184 if self.input.is_at(b"lowercase-string", true) {
185 (TypeTokenKind::LowercaseString, 16)
186 } else {
187 self.read_identifier()
188 }
189 }
190 [b'o' | b'O', b'p' | b'P', b'e' | b'E'] => {
191 if self.input.is_at(b"open-resource", true) {
192 (TypeTokenKind::OpenResource, 13)
193 } else {
194 self.read_identifier()
195 }
196 }
197 [b'a' | b'A', b'r' | b'R', b'r' | b'R'] => {
198 if self.input.is_at(b"array-key", true) {
199 (TypeTokenKind::ArrayKey, 9)
200 } else {
201 self.read_identifier()
202 }
203 }
204 [b'n' | b'N', b'o' | b'O', b'-'] => {
205 if self.input.is_at(b"no-return", true) {
206 (TypeTokenKind::NoReturn, 9)
207 } else {
208 self.read_identifier()
209 }
210 }
211 [b'v' | b'V', b'a' | b'A', b'l' | b'L'] => {
212 if self.input.is_at(b"value-of", true) {
213 (TypeTokenKind::ValueOf, 8)
214 } else {
215 self.read_identifier()
216 }
217 }
218 [b'k' | b'K', b'e' | b'E', b'y' | b'Y'] => {
219 if self.input.is_at(b"key-of", true) {
220 (TypeTokenKind::KeyOf, 6)
221 } else {
222 self.read_identifier()
223 }
224 }
225 [b'p' | b'P', b'r' | b'R', b'o' | b'O'] => {
226 if self.input.is_at(b"protected-properties-of", true) {
227 (TypeTokenKind::ProtectedPropertiesOf, 23)
228 } else if self.input.is_at(b"properties-of", true) {
229 (TypeTokenKind::PropertiesOf, 13)
230 } else {
231 self.read_identifier()
232 }
233 }
234 [b'p' | b'P', b'u' | b'U', b'b' | b'B'] => {
235 if self.input.is_at(b"public-properties-of", true) {
236 (TypeTokenKind::PublicPropertiesOf, 20)
237 } else {
238 self.read_identifier()
239 }
240 }
241 [b'p' | b'P', b'r' | b'R', b'i' | b'I'] => {
242 if self.input.is_at(b"private-properties-of", true) {
243 (TypeTokenKind::PrivatePropertiesOf, 21)
244 } else {
245 self.read_identifier()
246 }
247 }
248 [b'p' | b'P', b'o' | b'O', b's' | b'S'] => {
249 if self.input.is_at(b"positive-int", true) {
250 (TypeTokenKind::PositiveInt, 12)
251 } else {
252 self.read_identifier()
253 }
254 }
255 [b'n' | b'N', b'e' | b'E', b'g' | b'G'] => {
256 if self.input.is_at(b"negative-int", true) {
257 (TypeTokenKind::NegativeInt, 12)
258 } else {
259 self.read_identifier()
260 }
261 }
262 [b'.', b'.', b'.'] => (TypeTokenKind::Ellipsis, 3),
263 [b':', b':', ..] => (TypeTokenKind::ColonColon, 2),
264 [b'/', b'/', ..] => self.read_single_line_comment(),
265 [b'.', start_of_number!(), ..] => self.read_decimal(),
266 [start_of_number!(), ..] => self.read_number(),
267 [quote @ b'\'' | quote @ b'"', ..] => self.read_literal_string(quote),
268 [b'\\', start_of_identifier!(), ..] => self.read_fully_qualified_identifier(),
269 [start_of_identifier!(), ..] => self.read_identifier(),
270 [b'$', start_of_identifier!(), ..] => {
271 let mut length = 2;
272 while let [part_of_identifier!(), ..] = self.input.peek(length, 1) {
273 length += 1;
274 }
275
276 (TypeTokenKind::Variable, length)
277 }
278 [b':', ..] => (TypeTokenKind::Colon, 1),
279 [b'=', ..] => (TypeTokenKind::Equals, 1),
280 [b'?', ..] => (TypeTokenKind::Question, 1),
281 [b'&', ..] => (TypeTokenKind::Ampersand, 1),
282 [b'|', ..] => (TypeTokenKind::Pipe, 1),
283 [b'>', ..] => (TypeTokenKind::GreaterThan, 1),
284 [b'<', ..] => (TypeTokenKind::LessThan, 1),
285 [b'(', ..] => (TypeTokenKind::LeftParenthesis, 1),
286 [b')', ..] => (TypeTokenKind::RightParenthesis, 1),
287 [b'[', ..] => (TypeTokenKind::LeftBracket, 1),
288 [b']', ..] => (TypeTokenKind::RightBracket, 1),
289 [b'{', ..] => (TypeTokenKind::LeftBrace, 1),
290 [b'}', ..] => (TypeTokenKind::RightBrace, 1),
291 [b',', ..] => (TypeTokenKind::Comma, 1),
292 [b'+', ..] => (TypeTokenKind::Plus, 1),
293 [b'-', ..] => (TypeTokenKind::Minus, 1),
294 [unknown_byte, ..] => {
295 return Some(Err(SyntaxError::UnrecognizedToken(*unknown_byte, self.input.current_position())));
296 }
297 [] => {
298 unreachable!()
299 }
300 };
301
302 let buffer = self.input.consume(length);
303 let end = self.input.current_position();
304
305 self.token(kind, buffer, start, end)
306 }
307
308 fn read_single_line_comment(&self) -> (TypeTokenKind, usize) {
309 let mut length = 2;
310 loop {
311 match self.input.peek(length, 1) {
312 [b'\n', ..] | [] => {
313 break;
314 }
315 [_, ..] => {
316 length += 1;
317 }
318 }
319 }
320
321 (TypeTokenKind::SingleLineComment, length)
322 }
323
324 fn read_decimal(&self) -> (TypeTokenKind, usize) {
325 let mut length = read_digits_of_base(&self.input, 2, 10);
326 if let float_exponent!() = self.input.peek(length, 1) {
327 length += 1;
328 if let number_sign!() = self.input.peek(length, 1) {
329 length += 1;
330 }
331
332 length = read_digits_of_base(&self.input, length, 10);
333 }
334
335 (TypeTokenKind::LiteralFloat, length)
336 }
337
338 fn read_number(&self) -> (TypeTokenKind, usize) {
339 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
340 pub enum NumberKind {
341 Integer,
342 Float,
343 OctalOrFloat,
344 IntegerOrFloat,
345 }
346
347 let mut length = 1;
348
349 let (base, kind): (u8, NumberKind) = match self.input.read(3) {
350 start_of_binary_number!() => {
351 length += 1;
352
353 (2, NumberKind::Integer)
354 }
355 start_of_octal_number!() => {
356 length += 1;
357
358 (8, NumberKind::Integer)
359 }
360 start_of_hexadecimal_number!() => {
361 length += 1;
362
363 (16, NumberKind::Integer)
364 }
365 start_of_octal_or_float_number!() => (10, NumberKind::OctalOrFloat),
366 start_of_float_number!() => (10, NumberKind::Float),
367 _ => (10, NumberKind::IntegerOrFloat),
368 };
369
370 if kind != NumberKind::Float {
371 length = read_digits_of_base(&self.input, length, base);
372
373 if kind == NumberKind::Integer {
374 return (TypeTokenKind::LiteralInteger, length);
375 }
376 }
377
378 let is_float = matches!(self.input.peek(length, 3), float_separator!());
379
380 if !is_float {
381 return (TypeTokenKind::LiteralInteger, length);
382 }
383
384 if let [b'.'] = self.input.peek(length, 1) {
385 length += 1;
386 length = read_digits_of_base(&self.input, length, 10);
387 }
388
389 if let float_exponent!() = self.input.peek(length, 1) {
390 length += 1;
391 if let number_sign!() = self.input.peek(length, 1) {
392 length += 1;
393 }
394
395 length = read_digits_of_base(&self.input, length, 10);
396 }
397
398 (TypeTokenKind::LiteralFloat, length)
399 }
400
401 fn read_literal_string(&self, quote: &u8) -> (TypeTokenKind, usize) {
402 let total = self.input.len();
403 let start = self.input.current_offset();
404 let mut length = 1; let mut last_was_backslash = false;
406 let mut partial = false;
407
408 loop {
409 let pos = start + length;
410 if pos >= total {
411 partial = true;
413 break;
414 }
415
416 let byte = self.input.read_at(pos);
417 if matches!(byte, b'\\') {
418 last_was_backslash = !last_was_backslash;
420 length += 1;
421 } else {
422 if byte == quote && !last_was_backslash {
424 length += 1; break;
426 }
427
428 length += 1;
429 last_was_backslash = false;
430 }
431 }
432
433 if partial { (TypeTokenKind::PartialLiteralString, length) } else { (TypeTokenKind::LiteralString, length) }
434 }
435
436 fn read_fully_qualified_identifier(&self) -> (TypeTokenKind, usize) {
437 let mut length = 2;
438 let mut last_was_slash = false;
439 loop {
440 match self.input.peek(length, 1) {
441 [start_of_identifier!(), ..] if last_was_slash => {
442 length += 1;
443 last_was_slash = false;
444 }
445 [part_of_identifier!(), ..] if !last_was_slash => {
446 length += 1;
447 }
448 [b'\\', ..] => {
449 if last_was_slash {
450 length -= 1;
451
452 break;
453 }
454
455 length += 1;
456 last_was_slash = true;
457 }
458 _ => {
459 break;
460 }
461 }
462 }
463
464 (TypeTokenKind::FullyQualifiedIdentifier, length)
465 }
466
467 fn read_identifier(&self) -> (TypeTokenKind, usize) {
468 const KEYWORD_TYPES: [(&[u8], TypeTokenKind); 28] = [
469 (b"list", TypeTokenKind::List),
470 (b"int", TypeTokenKind::Int),
471 (b"integer", TypeTokenKind::Integer),
472 (b"string", TypeTokenKind::String),
473 (b"float", TypeTokenKind::Float),
474 (b"double", TypeTokenKind::Double),
475 (b"real", TypeTokenKind::Real),
476 (b"bool", TypeTokenKind::Bool),
477 (b"boolean", TypeTokenKind::Boolean),
478 (b"false", TypeTokenKind::False),
479 (b"true", TypeTokenKind::True),
480 (b"object", TypeTokenKind::Object),
481 (b"callable", TypeTokenKind::Callable),
482 (b"array", TypeTokenKind::Array),
483 (b"iterable", TypeTokenKind::Iterable),
484 (b"null", TypeTokenKind::Null),
485 (b"mixed", TypeTokenKind::Mixed),
486 (b"resource", TypeTokenKind::Resource),
487 (b"void", TypeTokenKind::Void),
488 (b"scalar", TypeTokenKind::Scalar),
489 (b"numeric", TypeTokenKind::Numeric),
490 (b"never", TypeTokenKind::Never),
491 (b"nothing", TypeTokenKind::Nothing),
492 (b"as", TypeTokenKind::As),
493 (b"is", TypeTokenKind::Is),
494 (b"not", TypeTokenKind::Not),
495 (b"min", TypeTokenKind::Min),
496 (b"max", TypeTokenKind::Max),
497 ];
498
499 let mut length = 1;
500 let mut ended_with_slash = false;
501 loop {
502 match self.input.peek(length, 2) {
503 [part_of_identifier!(), ..] => {
504 length += 1;
505 }
506 [b'\\', start_of_identifier!(), ..] => {
507 ended_with_slash = true;
508 break;
509 }
510 _ => {
511 break;
512 }
513 }
514 }
515
516 if !ended_with_slash {
517 for (value, kind) in KEYWORD_TYPES {
518 let keyword_length = value.len();
519 if keyword_length != length {
520 continue;
521 }
522
523 if self.input.is_at(value, true) {
524 return (kind, keyword_length);
525 }
526 }
527 }
528
529 let mut slashes = 0;
530 let mut last_was_slash = false;
531 loop {
532 match self.input.peek(length, 1) {
533 [start_of_identifier!(), ..] if last_was_slash => {
534 length += 1;
535 last_was_slash = false;
536 }
537 [part_of_identifier!(), ..] if !last_was_slash => {
538 length += 1;
539 }
540 [b'\\', ..] => {
541 if !last_was_slash {
542 length += 1;
543 slashes += 1;
544 last_was_slash = true;
545 } else {
546 length -= 1;
547 slashes -= 1;
548 last_was_slash = false;
549
550 break;
551 }
552 }
553 _ => {
554 break;
555 }
556 }
557 }
558
559 if last_was_slash {
560 length -= 1;
561 slashes -= 1;
562 }
563
564 if slashes > 0 { (TypeTokenKind::QualifiedIdentifier, length) } else { (TypeTokenKind::Identifier, length) }
565 }
566
567 #[inline]
568 fn token(
569 &self,
570 kind: TypeTokenKind,
571 value: &'input [u8],
572 from: Position,
573 to: Position,
574 ) -> Option<Result<TypeToken<'input>, SyntaxError>> {
575 let mut value_chunks = value.utf8_chunks();
576 let value_str = if let Some(chunk) = value_chunks.next() {
577 let valid = chunk.valid();
578
579 debug_assert_eq!(valid.len(), value.len());
580
581 valid
582 } else {
583 ""
584 };
585
586 Some(Ok(TypeToken { kind, value: value_str, span: Span::new(from, to) }))
587 }
588}