use std::error::Error;
use std::fmt;

#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // ">" and the keywords class, id, tag, attr, text, src, href, plus "#".
    Pipeline, Class, Id, Tag, Attr, Text, Src, Href, Pound,

    // "@name", ",", ":".
    Function(String), Comma, Colon,

    // Literal values, "." and "..".
    String(String),
    Float(f64),
    Number(usize),
    Bool(bool),
    Nil,
    Dot,
    DotDot,

    Minus,

    Tilde,

    LeftParen,
    RightParen,

    LeftBracket,
    RightBracket,

    // Set operators: "|", "&", "^".
    Union,
    Intersection,
    Difference,

    EOF,
}

impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Token::Pipeline => write!(f, ">"),
            Token::Class => write!(f, "class"),
            Token::Id => write!(f, "id"),
            Token::Tag => write!(f, "tag"),
            Token::Attr => write!(f, "attr"),
            Token::Text => write!(f, "text"),
            Token::Src => write!(f, "src"),
            Token::Href => write!(f, "href"),
            Token::Tilde => write!(f, "~"),
            Token::Function(func) => write!(f, "@{}", func),
            Token::Comma => write!(f, ","),
            Token::Colon => write!(f, ":"),
            Token::Number(n) => write!(f, "{}", n),
            Token::String(s) => write!(f, "{}", s),
            Token::Float(n) => write!(f, "{}", n),
            Token::Bool(b) => write!(f, "{}", b),
            Token::Minus => write!(f, "-"),
            Token::LeftParen => write!(f, "("),
            Token::RightParen => write!(f, ")"),
            Token::LeftBracket => write!(f, "["),
            Token::RightBracket => write!(f, "]"),
            Token::Union => write!(f, "|"),
            Token::Intersection => write!(f, "&"),
            Token::Difference => write!(f, "^"),
            Token::Dot => write!(f, "."),
            Token::DotDot => write!(f, ".."),
            Token::Nil => write!(f, "nil"),
            Token::EOF => write!(f, "EOF"),
            Token::Pound => write!(f, "#"),
        }
    }
}

#[derive(Debug)]
pub struct LexerError {
    pub message: String,
    pub line: usize,
    pub column: usize,
}

impl fmt::Display for LexerError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(
            f,
            "Lexical error (line {}, column {}): {}",
            self.line, self.column, self.message
        )
    }
}

impl Error for LexerError {}

pub struct Lexer {
    /// The input decomposed into characters for indexed access.
    chars: Vec<char>,
    /// Index of the character currently under examination.
    position: usize,
    /// Index of the next character to be read.
    read_position: usize,
    /// The character at `position`, or `None` once the input is exhausted.
    current_char: Option<char>,
    /// Current line number (1-based), for error reporting.
    line: usize,
    /// Current column number, for error reporting.
    column: usize,
}

impl Lexer {
    pub fn new(input: &str) -> Self {
        let estimated_capacity = input.len() + 1;
        let mut chars = Vec::with_capacity(estimated_capacity);
        chars.extend(input.chars());

        let mut lexer = Lexer {
            chars,
            position: 0,
            read_position: 0,
            current_char: None,
            line: 1,
            column: 0,
        };

        lexer.read_char();
        lexer
    }

    fn read_char(&mut self) {
        // Advance to the next character, or None at the end of input.
        if self.read_position >= self.chars.len() {
            self.current_char = None;
        } else {
            self.current_char = Some(self.chars[self.read_position]);
        }

        self.position = self.read_position;
        self.read_position += 1;

        // Track line and column for error reporting.
        if let Some('\n') = self.current_char {
            self.line += 1;
            self.column = 0;
        } else {
            self.column += 1;
        }
    }

    fn skip_whitespace(&mut self) {
        while let Some(c) = self.current_char {
            if c.is_whitespace() {
                self.read_char();
            } else {
                break;
            }
        }
    }

    fn is_identifier_start(&self, c: char) -> bool {
        c.is_alphabetic() || c == '_' || Self::is_unicode_identifier_part(c)
    }

    fn is_identifier_part(&self, c: char) -> bool {
        c.is_alphanumeric() || c == '_' || Self::is_unicode_identifier_part(c)
    }

    fn is_unicode_identifier_part(c: char) -> bool {
        (c >= '\u{4E00}' && c <= '\u{9FFF}')       // CJK Unified Ideographs
            || (c >= '\u{3040}' && c <= '\u{309F}')   // Hiragana
            || (c >= '\u{30A0}' && c <= '\u{30FF}')   // Katakana
            || (c >= '\u{AC00}' && c <= '\u{D7AF}')   // Hangul Syllables
            || (c >= '\u{1F600}' && c <= '\u{1F64F}') // Emoticons
            || (c >= '\u{1F300}' && c <= '\u{1F5FF}') // Miscellaneous Symbols and Pictographs
            || (c >= '\u{1F680}' && c <= '\u{1F6FF}') // Transport and Map Symbols
            || (c >= '\u{2600}' && c <= '\u{26FF}')   // Miscellaneous Symbols
    }

    fn is_function_name_start(&self, c: char) -> bool {
        c.is_ascii_alphabetic()
    }

    fn is_function_name_part(&self, c: char) -> bool {
        c.is_ascii_alphanumeric() || c == '_'
    }

    pub fn next_token(&mut self) -> Result<Token, LexerError> {
        self.skip_whitespace();

        if self.current_char.is_none() {
            return Ok(Token::EOF);
        }

        match self.current_char.unwrap() {
            '>' => {
                self.read_char();
                Ok(Token::Pipeline)
            }
            ',' => {
                self.read_char();
                Ok(Token::Comma)
            }
            ':' => {
                self.read_char();
                Ok(Token::Colon)
            }
            '|' => {
                self.read_char();
                Ok(Token::Union)
            }
            '^' => {
                self.read_char();
                Ok(Token::Difference)
            }
            '&' => {
                self.read_char();
                Ok(Token::Intersection)
            }
            '@' => self.read_function(),
            '"' => self.read_quoted_string(),
            '~' => {
                self.read_char();
                Ok(Token::Tilde)
            }
            '(' => {
                self.read_char();
                Ok(Token::LeftParen)
            }
            ')' => {
                self.read_char();
                Ok(Token::RightParen)
            }
            '[' => {
                self.read_char();
                Ok(Token::LeftBracket)
            }
            ']' => {
                self.read_char();
                Ok(Token::RightBracket)
            }
            '.' => {
                // A dot followed by a digit starts a float literal such as ".5".
                // Use a checked lookahead so a trailing "." does not index past
                // the end of the input.
                let next_is_digit = self
                    .chars
                    .get(self.read_position)
                    .map_or(false, |c| c.is_ascii_digit());
                if next_is_digit {
                    self.read_number(true)
                } else {
                    self.read_char();
                    Ok(Token::Dot)
                }
            }
            '-' => {
                self.read_char();
                Ok(Token::Minus)
            }
            '0'..='9' => self.read_number(false),
            '#' => {
                self.read_char();
                Ok(Token::Pound)
            }
            _ => self.read_string(),
        }
    }

    fn read_number(&mut self, has_dot: bool) -> Result<Token, LexerError> {
        let start_position = self.position;

        let mut has_dot = has_dot;
        let mut end = 0;

        // When called from the '.' arm, the dot has already been seen;
        // step past it before scanning digits.
        if has_dot {
            self.read_char();
        }

        while let Some(c) = self.current_char {
            if c.is_ascii_digit() {
                self.read_char();
                end = self.position;
            } else if c == '.' {
                if !has_dot {
                    has_dot = true;
                    self.read_char();
                    end = self.position;
                } else {
                    // A second dot cannot belong to the number: consume it and
                    // any digits that follow, but do not include them in `end`.
                    self.read_char();
                    while let Some(c1) = self.current_char {
                        if c1.is_ascii_digit() {
                            self.read_char();
                        } else {
                            break;
                        }
                    }
                }
            } else {
                break;
            }
        }

        if has_dot {
            let number_str: String = self.chars[start_position..end].iter().collect();
            match number_str.parse::<f64>() {
                Ok(float) => Ok(Token::Float(float)),
                Err(_) => Err(LexerError {
                    message: format!("Unable to parse the float: {}", number_str),
                    line: self.line,
                    column: self.column,
                }),
            }
        } else {
            let number_str: String = self.chars[start_position..self.position].iter().collect();
            match number_str.parse::<usize>() {
                Ok(number) => Ok(Token::Number(number)),
                Err(_) => Err(LexerError {
                    message: format!("Unable to parse the number: {}", number_str),
                    line: self.line,
                    column: self.column,
                }),
            }
        }
    }

    #[deprecated(note = "Keyword recognition now happens in `read_string`, which reads arguments.")]
    #[allow(dead_code)]
    fn read_identifier(&mut self) -> Result<Token, LexerError> {
        let start_position = self.position;

        while let Some(c) = self.current_char {
            if self.is_identifier_part(c) {
                self.read_char();
            } else {
                break;
            }
        }

        let identifier: String = self.chars[start_position..self.position].iter().collect();

        match identifier.as_str() {
            "class" => Ok(Token::Class),
            "id" => Ok(Token::Id),
            "tag" => Ok(Token::Tag),
            "attr" => Ok(Token::Attr),
            "text" => Ok(Token::Text),
            "src" => Ok(Token::Src),
            "href" => Ok(Token::Href),
            _ => Err(LexerError {
                message: "Illegal identifier".to_string(),
                line: self.line,
                column: self.column,
            }),
        }
    }

    fn read_function(&mut self) -> Result<Token, LexerError> {
        // Skip the leading '@'.
        self.read_char();

        let start_position = self.position;

        if let Some(c) = self.current_char {
            if !self.is_function_name_start(c) {
                return Err(LexerError {
                    message: "Function names must start with a letter.".to_string(),
                    line: self.line,
                    column: self.column,
                });
            }
        } else {
            return Err(LexerError {
                message: "Function name cannot be empty.".to_string(),
                line: self.line,
                column: self.column,
            });
        }

        while let Some(c) = self.current_char {
            if self.is_function_name_part(c) {
                self.read_char();
            } else {
                break;
            }
        }

        let function_name: String = self.chars[start_position..self.position].iter().collect();

        Ok(Token::Function(function_name))
    }

    fn read_quoted_string(&mut self) -> Result<Token, LexerError> {
        // Skip the opening quote.
        self.read_char();

        let mut value = String::new();
        let mut escaped = false;

        while let Some(c) = self.current_char {
            if escaped {
                match c {
                    '"' => value.push('"'),
                    '\\' => value.push('\\'),
                    'n' => value.push('\n'),
                    't' => value.push('\t'),
                    'r' => value.push('\r'),
                    'u' => {
                        // Expect exactly four hexadecimal digits after "\u".
                        let mut unicode_value = String::new();
                        for _ in 0..4 {
                            self.read_char();
                            if let Some(hex_char) = self.current_char {
                                if hex_char.is_ascii_hexdigit() {
                                    unicode_value.push(hex_char);
                                } else {
                                    return Err(LexerError {
                                        message: format!(
                                            "Invalid Unicode escape sequence: \\u{}",
                                            unicode_value
                                        ),
                                        line: self.line,
                                        column: self.column,
                                    });
                                }
                            } else {
                                return Err(LexerError {
                                    message: "Unfinished Unicode escape sequence.".to_string(),
                                    line: self.line,
                                    column: self.column,
                                });
                            }
                        }

                        if let Ok(code_point) = u32::from_str_radix(&unicode_value, 16) {
                            if let Some(unicode_char) = std::char::from_u32(code_point) {
                                value.push(unicode_char);
                            } else {
                                return Err(LexerError {
                                    message: format!(
                                        "Invalid Unicode code point: U+{}",
                                        unicode_value
                                    ),
                                    line: self.line,
                                    column: self.column,
                                });
                            }
                        } else {
                            return Err(LexerError {
                                message: format!(
                                    "Unable to parse Unicode escape sequence: \\u{}",
                                    unicode_value
                                ),
                                line: self.line,
                                column: self.column,
                            });
                        }
                    }
                    // Unknown escapes keep the character as-is.
                    _ => value.push(c),
                }
                escaped = false;
                self.read_char();
            } else if c == '\\' {
                escaped = true;
                self.read_char();
            } else if c == '"' {
                // Closing quote: consume it and finish the string.
                self.read_char();
                return Ok(Token::String(value));
            } else {
                value.push(c);
                self.read_char();
            }
        }

        Err(LexerError {
            message: "Unterminated string.".to_string(),
            line: self.line,
            column: self.column,
        })
    }

    fn read_string(&mut self) -> Result<Token, LexerError> {
        let start_position = self.position;

        // Read an unquoted argument up to whitespace or a delimiter.
        while let Some(c) = self.current_char {
            if c.is_whitespace() || c == '>' || c == ',' || c == '"' || c == '@' || c == ':' {
                break;
            }
            self.read_char();
        }

        let argument: String = self.chars[start_position..self.position].iter().collect();

        if argument.is_empty() {
            return Err(LexerError {
                message: format!("Unrecognized characters: {:?}", self.current_char),
                line: self.line,
                column: self.column,
            });
        }

        // Keywords and literal words are recognized here; anything else is a plain string.
        match argument.as_str() {
            "class" => Ok(Token::Class),
            "id" => Ok(Token::Id),
            "tag" => Ok(Token::Tag),
            "attr" => Ok(Token::Attr),
            "text" => Ok(Token::Text),
            "src" => Ok(Token::Src),
            "href" => Ok(Token::Href),
            "true" => Ok(Token::Bool(true)),
            "false" => Ok(Token::Bool(false)),
            "nil" => Ok(Token::Nil),
            _ => Ok(Token::String(argument)),
        }
    }

    fn recover_from_error(&mut self) {
        // Skip ahead to the next character that could start a valid token.
        while let Some(c) = self.current_char {
            if c == '>' || c == ',' || c == '"' || c == '@' || self.is_identifier_start(c) {
                break;
            }
            self.read_char();
        }
    }
}

pub fn tokenize(input: &str) -> Vec<(Token, usize, usize)> {
    let mut lexer = Lexer::new(input);

    let estimated_tokens = (input.len() / 4).max(8);
    let mut tokens_with_pos = Vec::with_capacity(estimated_tokens);

    loop {
        let line = lexer.line;
        let column = lexer.column;

        match lexer.next_token() {
            Ok(Token::EOF) => {
                tokens_with_pos.push((Token::EOF, line, column));
                break;
            }
            Ok(token) => tokens_with_pos.push((token, line, column)),
            // On a lexical error, skip past the offending characters and keep going.
            Err(_) => lexer.recover_from_error(),
        }
    }
    tokens_with_pos
}
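
// The tests below are an illustrative sketch added for documentation, not part of the
// original source: they exercise the public `tokenize` entry point and only check the
// token sequence, ignoring the reported (line, column) positions. The `@html` function
// name and the sample inputs are arbitrary examples.
#[cfg(test)]
mod tests {
    use super::*;

    // Helper: collect only the tokens, dropping the (line, column) positions.
    fn tokens_of(input: &str) -> Vec<Token> {
        tokenize(input).into_iter().map(|(token, _, _)| token).collect()
    }

    #[test]
    fn lexes_keywords_and_punctuation() {
        assert_eq!(
            tokens_of("class > #id"),
            vec![
                Token::Class,
                Token::Pipeline,
                Token::Pound,
                Token::Id,
                Token::EOF
            ]
        );
    }

    #[test]
    fn lexes_functions_and_quoted_strings_with_escapes() {
        assert_eq!(
            tokens_of(r#"@html("a\n b")"#),
            vec![
                Token::Function("html".to_string()),
                Token::LeftParen,
                Token::String("a\n b".to_string()),
                Token::RightParen,
                Token::EOF
            ]
        );
    }

    #[test]
    fn lexes_integers_and_floats() {
        assert_eq!(
            tokens_of("3.14 42 .5"),
            vec![
                Token::Float(3.14),
                Token::Number(42),
                Token::Float(0.5),
                Token::EOF
            ]
        );
    }
}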