1mod keyword;
2
3use mago_database::file::FileId;
4use mago_database::file::HasFileId;
5use mago_span::Position;
6use mago_syntax_core::float_exponent;
7use mago_syntax_core::float_separator;
8use mago_syntax_core::input::Input;
9use mago_syntax_core::number_sign;
10use mago_syntax_core::part_of_identifier;
11use mago_syntax_core::start_of_binary_number;
12use mago_syntax_core::start_of_float_number;
13use mago_syntax_core::start_of_hexadecimal_number;
14use mago_syntax_core::start_of_identifier;
15use mago_syntax_core::start_of_octal_number;
16use mago_syntax_core::start_of_octal_or_float_number;
17use mago_syntax_core::utils::read_digits_of_base;
18
19use crate::error::SyntaxError;
20use crate::token::TypeToken;
21use crate::token::TypeTokenKind;
22
23#[derive(Debug)]
24pub struct TypeLexer<'arena> {
25 input: Input<'arena>,
26}
27
28impl<'arena> TypeLexer<'arena> {
29 #[inline]
30 #[must_use]
31 pub fn new(input: Input<'arena>) -> TypeLexer<'arena> {
32 TypeLexer { input }
33 }
34
35 #[inline]
36 #[must_use]
37 pub fn has_reached_eof(&self) -> bool {
38 self.input.has_reached_eof()
39 }
40
41 #[inline]
42 #[must_use]
43 pub fn current_position(&self) -> Position {
44 self.input.current_position()
45 }
46
47 #[inline]
48 #[must_use]
49 pub fn slice_in_range(&self, from: u32, to: u32) -> &'arena str {
50 let bytes_slice = self.input.slice_in_range(from, to);
51 bytes_slice.utf8_chunks().next().map_or("", |chunk| chunk.valid())
52 }
53
54 #[inline]
55 pub fn advance(&mut self) -> Option<Result<TypeToken<'arena>, SyntaxError>> {
56 if self.input.has_reached_eof() {
57 return None;
58 }
59
60 let start = self.input.current_position();
61 let whitespaces = self.input.consume_whitespaces();
62 if !whitespaces.is_empty() {
63 let end = self.input.current_position();
64 return Some(Ok(self.token(TypeTokenKind::Whitespace, whitespaces, start, end)));
65 }
66
67 let remaining = self.input.read_remaining();
68 let first = unsafe { *remaining.get_unchecked(0) };
70 let second = remaining.get(1).copied();
71
72 let (kind, length) = match first {
73 b'*' => (TypeTokenKind::Asterisk, 1),
74 b':' => {
75 if second == Some(b':') {
76 (TypeTokenKind::ColonColon, 2)
77 } else {
78 (TypeTokenKind::Colon, 1)
79 }
80 }
81 b'=' => (TypeTokenKind::Equals, 1),
82 b'?' => (TypeTokenKind::Question, 1),
83 b'!' => (TypeTokenKind::Exclamation, 1),
84 b'&' => (TypeTokenKind::Ampersand, 1),
85 b'|' => (TypeTokenKind::Pipe, 1),
86 b'>' => (TypeTokenKind::GreaterThan, 1),
87 b'<' => (TypeTokenKind::LessThan, 1),
88 b'(' => (TypeTokenKind::LeftParenthesis, 1),
89 b')' => (TypeTokenKind::RightParenthesis, 1),
90 b'[' => (TypeTokenKind::LeftBracket, 1),
91 b']' => (TypeTokenKind::RightBracket, 1),
92 b'{' => (TypeTokenKind::LeftBrace, 1),
93 b'}' => (TypeTokenKind::RightBrace, 1),
94 b',' => (TypeTokenKind::Comma, 1),
95 b'+' => (TypeTokenKind::Plus, 1),
96 b'-' => (TypeTokenKind::Minus, 1),
97 b'.' => match remaining.get(..3) {
98 Some([b'.', b'.', b'.']) => (TypeTokenKind::Ellipsis, 3),
99 _ if matches!(second, Some(b'0'..=b'9')) => self.read_decimal(),
100 _ => {
101 return Some(Err(SyntaxError::UnrecognizedToken(
102 self.file_id(),
103 first,
104 self.input.current_position(),
105 )));
106 }
107 },
108 b'/' if second == Some(b'/') => self.read_single_line_comment(),
109 b'\'' | b'"' => self.read_literal_string(first),
110 b'\\' if second.is_some_and(|b| b.is_ascii_alphabetic() || b == b'_' || b >= 0x80) => {
111 self.read_fully_qualified_identifier()
112 }
113 b'$' if second.is_some_and(|b| b.is_ascii_alphabetic() || b == b'_' || b >= 0x80) => self.read_variable(),
114 b'0'..=b'9' => self.read_number(),
115 b if b.is_ascii_alphabetic() || b == b'_' || b >= 0x80 => self.read_identifier_or_keyword(),
116 _ => {
117 return Some(Err(SyntaxError::UnrecognizedToken(self.file_id(), first, self.input.current_position())));
118 }
119 };
120
121 let buffer = self.input.consume(length);
122 let end = self.input.current_position();
123
124 Some(Ok(self.token(kind, buffer, start, end)))
125 }
126
127 #[inline]
128 fn read_variable(&self) -> (TypeTokenKind, usize) {
129 let mut length = 2;
130 while let [part_of_identifier!(), ..] = self.input.peek(length, 1) {
131 length += 1;
132 }
133 (TypeTokenKind::Variable, length)
134 }
135
136 #[inline]
137 fn read_single_line_comment(&self) -> (TypeTokenKind, usize) {
138 let mut length = 2;
139 loop {
140 match self.input.peek(length, 1) {
141 [b'\n', ..] | [] => break,
142 [_, ..] => length += 1,
143 }
144 }
145 (TypeTokenKind::SingleLineComment, length)
146 }
147
148 #[inline]
149 fn read_decimal(&self) -> (TypeTokenKind, usize) {
150 let mut length = read_digits_of_base(&self.input, 2, 10);
151 if let float_exponent!() = self.input.peek(length, 1) {
152 length += 1;
153 if let number_sign!() = self.input.peek(length, 1) {
154 length += 1;
155 }
156 length = read_digits_of_base(&self.input, length, 10);
157 }
158 (TypeTokenKind::LiteralFloat, length)
159 }
160
161 #[inline]
162 fn read_number(&self) -> (TypeTokenKind, usize) {
163 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
164 enum NumberKind {
165 Integer,
166 Float,
167 OctalOrFloat,
168 IntegerOrFloat,
169 }
170
171 let mut length = 1;
172 let (base, kind): (u8, NumberKind) = match self.input.read(3) {
173 start_of_binary_number!() => {
174 length += 1;
175 (2, NumberKind::Integer)
176 }
177 start_of_octal_number!() => {
178 length += 1;
179 (8, NumberKind::Integer)
180 }
181 start_of_hexadecimal_number!() => {
182 length += 1;
183 (16, NumberKind::Integer)
184 }
185 start_of_octal_or_float_number!() => (10, NumberKind::OctalOrFloat),
186 start_of_float_number!() => (10, NumberKind::Float),
187 _ => (10, NumberKind::IntegerOrFloat),
188 };
189
190 if kind != NumberKind::Float {
191 length = read_digits_of_base(&self.input, length, base);
192 if kind == NumberKind::Integer {
193 return (TypeTokenKind::LiteralInteger, length);
194 }
195 }
196
197 let is_float = matches!(self.input.peek(length, 3), float_separator!());
198 if !is_float {
199 return (TypeTokenKind::LiteralInteger, length);
200 }
201
202 if let [b'.'] = self.input.peek(length, 1) {
203 length += 1;
204 length = read_digits_of_base(&self.input, length, 10);
205 }
206
207 if let float_exponent!() = self.input.peek(length, 1) {
208 let mut exp_length = length + 1;
209 if let number_sign!() = self.input.peek(exp_length, 1) {
210 exp_length += 1;
211 }
212
213 let after_exp = read_digits_of_base(&self.input, exp_length, 10);
214 if after_exp > exp_length {
215 length = after_exp;
216 }
217 }
218
219 (TypeTokenKind::LiteralFloat, length)
220 }
221
222 #[inline]
223 fn read_literal_string(&self, quote: u8) -> (TypeTokenKind, usize) {
224 let total = self.input.len();
225 let start = self.input.current_offset();
226 let mut length = 1;
227 let mut last_was_backslash = false;
228 let mut partial = false;
229
230 loop {
231 let pos = start + length;
232 if pos >= total {
233 partial = true;
234 break;
235 }
236
237 let byte = self.input.read_at(pos);
238 if *byte == b'\\' {
239 last_was_backslash = !last_was_backslash;
240 length += 1;
241 } else {
242 if byte == "e && !last_was_backslash {
243 length += 1;
244 break;
245 }
246 length += 1;
247 last_was_backslash = false;
248 }
249 }
250
251 if partial { (TypeTokenKind::PartialLiteralString, length) } else { (TypeTokenKind::LiteralString, length) }
252 }
253
254 #[inline]
255 fn read_fully_qualified_identifier(&self) -> (TypeTokenKind, usize) {
256 let mut length = 2;
257 let mut last_was_slash = false;
258 loop {
259 match self.input.peek(length, 1) {
260 [start_of_identifier!(), ..] if last_was_slash => {
261 length += 1;
262 last_was_slash = false;
263 }
264 [part_of_identifier!(), ..] if !last_was_slash => {
265 length += 1;
266 }
267 [b'\\', ..] => {
268 if last_was_slash {
269 length -= 1;
270 break;
271 }
272 length += 1;
273 last_was_slash = true;
274 }
275 _ => break,
276 }
277 }
278 (TypeTokenKind::FullyQualifiedIdentifier, length)
279 }
280
281 #[inline]
284 fn read_identifier_or_keyword(&self) -> (TypeTokenKind, usize) {
285 let remaining = self.input.read_remaining();
286 let total = remaining.len();
287 let mut length = 1;
288 let mut next_is_hyphen = false;
289 let mut next_is_backslash = false;
290
291 while length < total {
295 let b = unsafe { *remaining.get_unchecked(length) };
297 if mago_syntax_core::utils::is_part_of_identifier(&b) {
298 length += 1;
299 continue;
300 }
301
302 if b == b'-' && length + 1 < total {
303 let b2 = unsafe { *remaining.get_unchecked(length + 1) };
305 if mago_syntax_core::utils::is_part_of_identifier(&b2) {
306 next_is_hyphen = true;
307 }
308 } else if b == b'\\' && length + 1 < total {
309 let b2 = unsafe { *remaining.get_unchecked(length + 1) };
311 if mago_syntax_core::utils::is_start_of_identifier(&b2) {
312 next_is_backslash = true;
313 }
314 } else {
315 }
317
318 break;
319 }
320
321 if next_is_backslash {
322 return self.finish_qualified_identifier(length);
323 }
324
325 if !next_is_hyphen {
326 let bytes = unsafe { remaining.get_unchecked(..length) };
328 if let Some(kind) = keyword::lookup_keyword(bytes) {
329 return (kind, length);
330 }
331 return (TypeTokenKind::Identifier, length);
332 }
333
334 let base_len = length;
335 while length < total {
336 let b = unsafe { *remaining.get_unchecked(length) };
338 if mago_syntax_core::utils::is_part_of_identifier(&b) {
339 length += 1;
340 continue;
341 }
342
343 if b == b'-' && length + 1 < total {
344 let b2 = unsafe { *remaining.get_unchecked(length + 1) };
346 if mago_syntax_core::utils::is_part_of_identifier(&b2) {
347 length += 1;
348 continue;
349 }
350 }
351
352 break;
353 }
354
355 let bytes = unsafe { remaining.get_unchecked(..length) };
357 if let Some(kind) = keyword::lookup_keyword(bytes) {
358 return (kind, length);
359 }
360
361 let base_bytes = unsafe { remaining.get_unchecked(..base_len) };
363 if let Some(kind) = keyword::lookup_keyword(base_bytes) {
364 return (kind, base_len);
365 }
366
367 (TypeTokenKind::Identifier, base_len)
368 }
369
370 #[inline]
372 fn finish_qualified_identifier(&self, start_len: usize) -> (TypeTokenKind, usize) {
373 let mut length = start_len;
374 let mut slashes = 0;
375 let mut last_was_slash = false;
376
377 loop {
378 match self.input.peek(length, 1) {
379 [start_of_identifier!(), ..] if last_was_slash => {
380 length += 1;
381 last_was_slash = false;
382 }
383 [part_of_identifier!(), ..] if !last_was_slash => {
384 length += 1;
385 }
386 [b'\\', ..] => {
387 if last_was_slash {
388 length -= 1;
389 slashes -= 1;
390 break;
391 }
392 length += 1;
393 slashes += 1;
394 last_was_slash = true;
395 }
396 _ => break,
397 }
398 }
399
400 if last_was_slash {
401 length -= 1;
402 slashes -= 1;
403 }
404
405 if slashes > 0 { (TypeTokenKind::QualifiedIdentifier, length) } else { (TypeTokenKind::Identifier, length) }
406 }
407
408 #[inline]
409 fn token(&self, kind: TypeTokenKind, value: &'arena [u8], start: Position, _end: Position) -> TypeToken<'arena> {
410 let value_str = unsafe { std::str::from_utf8_unchecked(value) };
414 TypeToken { kind, start, value: value_str }
415 }
416}
417
418impl HasFileId for TypeLexer<'_> {
419 #[inline]
420 fn file_id(&self) -> FileId {
421 self.input.file_id()
422 }
423}