TypeScript_Rust_Compiler/
lexer_utf8.rs1use crate::error::{CompilerError, Result};
4use crate::lexer::{Token, Keyword};
5
/// Character-based lexer state for TypeScript source text.
///
/// The input is held as a vector of `char`s, so `position` counts Unicode
/// scalar values rather than bytes.
#[derive(Debug, Clone)]
pub struct Utf8Lexer {
    /// The complete input, one entry per Unicode scalar value.
    chars: Vec<char>,
    /// Index into `chars` of the character currently being examined.
    position: usize,
    /// Line of the current character; used in error reports.
    line: usize,
    /// Column of the current character; used in error reports.
    column: usize,
}
13
14impl Utf8Lexer {
15 pub fn new(input: String) -> Self {
17 Self {
18 chars: input.chars().collect(),
19 position: 0,
20 line: 1,
21 column: 1,
22 }
23 }
24
25 pub fn tokenize(&mut self) -> Result<Vec<Token>> {
27 let mut tokens = Vec::new();
28
29 while self.position < self.chars.len() {
30 match self.next_token()? {
31 Some(token) => {
32 tokens.push(token);
33 }
34 None => break,
35 }
36 }
37
38 tokens.push(Token::EOF);
39 Ok(tokens)
40 }
41
42 fn next_token(&mut self) -> Result<Option<Token>> {
44 self.skip_whitespace();
45
46 if self.position >= self.chars.len() {
47 return Ok(None);
48 }
49
50 let ch = self.current_char();
51 let token = match ch {
52 '+' => Ok(Some(Token::Plus)),
53 '-' => Ok(Some(Token::Minus)),
54 '*' => Ok(Some(Token::Multiply)),
55 '/' => {
56 if self.peek_char() == Some('^') {
58 Ok(self.parse_regex()?)
59 } else {
60 Ok(Some(Token::Divide))
61 }
62 },
63 '%' => Ok(Some(Token::Modulo)),
64 '=' => {
65 if self.peek_char() == Some('=') {
66 self.advance();
67 if self.peek_char() == Some('=') {
68 self.advance();
69 Ok(Some(Token::StrictEqual))
70 } else {
71 Ok(Some(Token::Equal))
72 }
73 } else if self.peek_char() == Some('>') {
74 self.advance();
75 Ok(Some(Token::Arrow))
76 } else {
77 Ok(Some(Token::Assign))
78 }
79 }
80 '!' => {
81 if self.peek_char() == Some('=') {
82 self.advance();
83 if self.peek_char() == Some('=') {
84 self.advance();
85 Ok(Some(Token::StrictNotEqual))
86 } else {
87 Ok(Some(Token::NotEqual))
88 }
89 } else {
90 Ok(Some(Token::Not))
91 }
92 }
93 '<' => {
94 if self.peek_char() == Some('=') {
95 self.advance();
96 Ok(Some(Token::LessEqual))
97 } else {
98 Ok(Some(Token::LessThan))
99 }
100 }
101 '>' => {
102 if self.peek_char() == Some('=') {
103 self.advance();
104 Ok(Some(Token::GreaterEqual))
105 } else {
106 Ok(Some(Token::GreaterThan))
107 }
108 }
109 '&' => {
110 if self.peek_char() == Some('&') {
111 self.advance();
112 Ok(Some(Token::And))
113 } else {
114 Ok(Some(Token::Intersection))
115 }
116 }
117 '|' => {
118 if self.peek_char() == Some('|') {
119 self.advance();
120 Ok(Some(Token::Or))
121 } else {
122 Ok(Some(Token::Union))
123 }
124 }
125 '(' => Ok(Some(Token::LeftParen)),
126 ')' => Ok(Some(Token::RightParen)),
127 '{' => Ok(Some(Token::LeftBrace)),
128 '}' => Ok(Some(Token::RightBrace)),
129 '[' => Ok(Some(Token::LeftBracket)),
130 ']' => Ok(Some(Token::RightBracket)),
131 ';' => Ok(Some(Token::Semicolon)),
132 ',' => Ok(Some(Token::Comma)),
133 '.' => Ok(Some(Token::Dot)),
134 ':' => Ok(Some(Token::Colon)),
135 '?' => Ok(Some(Token::QuestionMark)),
136 '@' => Ok(Some(Token::At)), '"' | '\'' => Ok(self.parse_string()?),
138 '`' => Ok(self.parse_template_literal()?),
139 '0'..='9' => Ok(self.parse_number()?),
140 _ if ch.is_alphabetic() || ch == '_' || ch == '$' => Ok(self.parse_identifier_or_keyword()?),
141 _ => {
142 return Err(CompilerError::parse_error(
143 self.line,
144 self.column,
145 format!("Unexpected character: {}", ch),
146 ));
147 }
148 };
149
150 match ch {
152 '0'..='9' => {
153 }
155 '"' | '\'' => {
156 }
158 _ if ch.is_alphabetic() || ch == '_' || ch == '$' => {
159 }
161 _ => {
162 self.advance();
164 }
165 }
166 token
167 }
168
169 fn current_char(&self) -> char {
171 self.chars.get(self.position).copied().unwrap_or('\0')
172 }
173
174 fn peek_char(&self) -> Option<char> {
176 self.chars.get(self.position + 1).copied()
177 }
178
179 fn advance(&mut self) {
181 if self.position < self.chars.len() {
182 let ch = self.current_char();
183 if ch == '\n' {
184 self.line += 1;
185 self.column = 1;
186 } else {
187 self.column += 1;
188 }
189 self.position += 1;
190 }
191 }
192
193 fn skip_whitespace(&mut self) {
195 while self.position < self.chars.len() {
196 let ch = self.current_char();
197 if ch.is_whitespace() {
198 self.advance();
199 } else if ch == '/' && self.peek_char() == Some('/') {
200 self.advance(); self.advance(); while self.position < self.chars.len() && self.current_char() != '\n' {
204 self.advance();
205 }
206 } else if ch == '/' && self.peek_char() == Some('*') {
207 self.advance(); self.advance(); while self.position < self.chars.len() {
211 if self.current_char() == '*' && self.peek_char() == Some('/') {
212 self.advance(); self.advance(); break;
215 }
216 self.advance();
217 }
218 } else {
219 break;
220 }
221 }
222 }
223
224 fn parse_string(&mut self) -> Result<Option<Token>> {
226 let quote = self.current_char();
227 let mut value = String::new();
228 self.advance();
229
230 while self.position < self.chars.len() {
231 let ch = self.current_char();
232 if ch == quote {
233 self.advance();
234 return Ok(Some(Token::String(value)));
235 } else if ch == '\\' {
236 self.advance();
237 if self.position < self.chars.len() {
238 let escaped = self.current_char();
239 value.push(match escaped {
240 'n' => '\n',
241 't' => '\t',
242 'r' => '\r',
243 '\\' => '\\',
244 '"' => '"',
245 '\'' => '\'',
246 _ => escaped,
247 });
248 self.advance();
249 }
250 } else {
251 value.push(ch);
252 self.advance();
253 }
254 }
255
256 Err(CompilerError::parse_error(
257 self.line,
258 self.column,
259 "Unterminated string literal",
260 ))
261 }
262
263 fn parse_template_literal(&mut self) -> Result<Option<Token>> {
265 let mut value = String::new();
266 self.advance();
267
268 while self.position < self.chars.len() {
269 let ch = self.current_char();
270 if ch == '`' {
271 self.advance();
272 return Ok(Some(Token::TemplateLiteral(value)));
273 } else if ch == '\\' {
274 self.advance();
275 if self.position < self.chars.len() {
276 let escaped = self.current_char();
277 value.push(match escaped {
278 'n' => '\n',
279 't' => '\t',
280 'r' => '\r',
281 '\\' => '\\',
282 '`' => '`',
283 _ => escaped,
284 });
285 self.advance();
286 }
287 } else {
288 value.push(ch);
289 self.advance();
290 }
291 }
292
293 Err(CompilerError::parse_error(
294 self.line,
295 self.column,
296 "Unterminated template literal",
297 ))
298 }
299
300 fn parse_number(&mut self) -> Result<Option<Token>> {
302 let mut value = String::new();
303
304 while self.position < self.chars.len() {
305 let ch = self.current_char();
306 if ch.is_ascii_digit() || ch == '.' {
307 value.push(ch);
308 self.advance();
309 } else {
310 break;
311 }
312 }
313
314 match value.parse::<f64>() {
315 Ok(num) => Ok(Some(Token::Number(num))),
316 Err(_) => Err(CompilerError::parse_error(
317 self.line,
318 self.column,
319 format!("Invalid number: {}", value),
320 )),
321 }
322 }
323
324 fn parse_identifier_or_keyword(&mut self) -> Result<Option<Token>> {
326 let mut value = String::new();
327
328 while self.position < self.chars.len() {
329 let ch = self.current_char();
330 if ch.is_alphanumeric() || ch == '_' || ch == '$' {
331 value.push(ch);
332 self.advance();
333 } else {
334 break;
335 }
336 }
337
338 if let Some(keyword) = self.parse_keyword(&value) {
340 Ok(Some(Token::Keyword(keyword)))
341 } else {
342 Ok(Some(Token::Identifier(value)))
343 }
344 }
345
346 fn parse_keyword(&self, value: &str) -> Option<Keyword> {
348 match value {
349 "let" => Some(Keyword::Let),
350 "const" => Some(Keyword::Const),
351 "var" => Some(Keyword::Var),
352 "function" => Some(Keyword::Function),
353 "class" => Some(Keyword::Class),
354 "interface" => Some(Keyword::Interface),
355 "type" => Some(Keyword::Type),
356 "enum" => Some(Keyword::Enum),
357 "namespace" => Some(Keyword::Namespace),
358 "module" => Some(Keyword::Module),
359 "export" => Some(Keyword::Export),
360 "import" => Some(Keyword::Import),
361 "public" => Some(Keyword::Public),
362 "private" => Some(Keyword::Private),
363 "protected" => Some(Keyword::Protected),
364 "static" => Some(Keyword::Static),
365 "readonly" => Some(Keyword::Readonly),
366 "abstract" => Some(Keyword::Abstract),
367 "async" => Some(Keyword::Async),
368 "await" => Some(Keyword::Await),
369 "extends" => Some(Keyword::Extends),
370 "implements" => Some(Keyword::Implements),
371 "constructor" => Some(Keyword::Constructor),
372 "get" => Some(Keyword::Get),
373 "set" => Some(Keyword::Set),
374 "this" => Some(Keyword::This),
375 "super" => Some(Keyword::Super),
376 "new" => Some(Keyword::New),
377 "return" => Some(Keyword::Return),
378 "if" => Some(Keyword::If),
379 "else" => Some(Keyword::Else),
380 "while" => Some(Keyword::While),
381 "for" => Some(Keyword::For),
382 "do" => Some(Keyword::Do),
383 "break" => Some(Keyword::Break),
384 "continue" => Some(Keyword::Continue),
385 "switch" => Some(Keyword::Switch),
386 "case" => Some(Keyword::Case),
387 "default" => Some(Keyword::Default),
388 "try" => Some(Keyword::Try),
389 "catch" => Some(Keyword::Catch),
390 "finally" => Some(Keyword::Finally),
391 "throw" => Some(Keyword::Throw),
392 "true" => Some(Keyword::True),
393 "false" => Some(Keyword::False),
394 "null" => Some(Keyword::Null),
395 "undefined" => Some(Keyword::Undefined),
396 "void" => Some(Keyword::Void),
397 "never" => Some(Keyword::Never),
398 "any" => Some(Keyword::Any),
399 "unknown" => Some(Keyword::Unknown),
400 "object" => Some(Keyword::Object),
401 "string" => Some(Keyword::String),
402 "number" => Some(Keyword::Number),
403 "boolean" => Some(Keyword::Boolean),
404 "symbol" => Some(Keyword::Symbol),
405 "bigint" => Some(Keyword::BigInt),
406 "typeof" => Some(Keyword::Typeof),
407 _ => None,
408 }
409 }
410
411 fn parse_regex(&mut self) -> Result<Option<Token>> {
413 let mut pattern = String::new();
414 let mut flags = String::new();
415
416 self.advance(); while self.position < self.chars.len() {
420 let ch = self.current_char();
421 if ch == '/' {
422 self.advance();
423 break;
424 } else if ch == '\\' {
425 pattern.push(ch);
427 self.advance();
428 if self.position < self.chars.len() {
429 pattern.push(self.current_char());
430 self.advance();
431 }
432 } else {
433 pattern.push(ch);
434 self.advance();
435 }
436 }
437
438 while self.position < self.chars.len() {
440 let ch = self.current_char();
441 if ch.is_alphabetic() {
442 flags.push(ch);
443 self.advance();
444 } else {
445 break;
446 }
447 }
448
449 Ok(Some(Token::RegExp(pattern, flags)))
450 }
451}