htmls/
lexer.rs

1use std::error::Error;
2use std::fmt;
3
/// A lexical token of the selector language.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    /// Pipe operator: `>`.
    Pipeline,

    /// Element selection instruction: `class`.
    Class,
    /// Element selection instruction: `id`.
    Id,
    /// Element selection instruction: `tag`.
    Tag,
    /// Element selection instruction: `attr`.
    Attr,

    /// Text selection instruction: `text`.
    Text,
    /// Text selection instruction: `src`.
    Src,
    /// Text selection instruction: `href`.
    Href,
    /// Attribute value text selection: `#`.
    Pound,

    /// Function call: `@name` (the payload is the name without the `@`).
    Function(String),
    /// Function parameter separator: `,`.
    Comma,

    /// `:`
    Colon,

    /// String literal or bare argument.
    String(String),
    /// Floating point literal.
    Float(f64),
    /// Unsigned integer literal.
    Number(usize),
    /// Boolean literal.
    Bool(bool),
    /// The `nil` literal.
    Nil,
    /// `.`
    Dot,
    /// `..`
    DotDot,

    /// `-`
    Minus,

    /// `~`
    Tilde,

    /// `(`
    LeftParen,
    /// `)`
    RightParen,

    /// `[`
    LeftBracket,
    /// `]`
    RightBracket,

    /// `|`
    Union,
    /// `&`
    Intersection,
    /// `^`
    Difference,

    /// End of input.
    EOF,
}

impl fmt::Display for Token {
    /// Writes the textual form of the token.
    ///
    /// Payload-carrying variants print their payload (functions are prefixed
    /// with `@`, strings are printed without quotes); every other variant
    /// prints its fixed symbol or keyword.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let symbol = match self {
            // Variants with a payload are formatted directly.
            Token::Function(name) => return write!(f, "@{}", name),
            Token::Number(value) => return write!(f, "{}", value),
            Token::String(text) => return write!(f, "{}", text),
            Token::Float(value) => return write!(f, "{}", value),
            Token::Bool(flag) => return write!(f, "{}", flag),

            // Everything else maps to a fixed piece of text.
            Token::Pipeline => ">",
            Token::Class => "class",
            Token::Id => "id",
            Token::Tag => "tag",
            Token::Attr => "attr",
            Token::Text => "text",
            Token::Src => "src",
            Token::Href => "href",
            Token::Pound => "#",
            Token::Comma => ",",
            Token::Colon => ":",
            Token::Nil => "nil",
            Token::Dot => ".",
            Token::DotDot => "..",
            Token::Minus => "-",
            Token::Tilde => "~",
            Token::LeftParen => "(",
            Token::RightParen => ")",
            Token::LeftBracket => "[",
            Token::RightBracket => "]",
            Token::Union => "|",
            Token::Intersection => "&",
            Token::Difference => "^",
            Token::EOF => "EOF",
        };
        f.write_str(symbol)
    }
}
102
/// Error produced by the lexer, carrying the position stored by the code
/// that created it.
#[derive(Debug)]
pub struct LexerError {
    /// Human-readable description of the problem.
    pub message: String,
    /// Line number recorded when the error was built.
    pub line: usize,
    /// Column number recorded when the error was built.
    pub column: usize,
}

impl fmt::Display for LexerError {
    /// Formats the error as `Lexical error(line L, column C): message`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let LexerError {
            message,
            line,
            column,
        } = self;
        write!(f, "Lexical error(line {}, column {}): {}", line, column, message)
    }
}

/// `LexerError` has no underlying cause, so the default `Error` methods suffice.
impl Error for LexerError {}
121
/// Lexical analyzer over a buffer of characters.
pub struct Lexer {
    /// The whole input, decoded into characters.
    chars: Vec<char>,
    /// Index of the character currently being processed.
    position: usize,
    /// Index of the next character to be read (pre-read position).
    read_position: usize,
    /// The character at `position`, or `None` once the input is exhausted.
    current_char: Option<char>,
    /// Current line number.
    line: usize,
    /// Current column number.
    column: usize,
}
137
138impl Lexer {
139    pub fn new(input: &str) -> Self {
140        let estimated_capacity = input.len() + 1;
141        let mut chars = Vec::with_capacity(estimated_capacity);
142        chars.extend(input.chars());
143
144        let mut lexer = Lexer {
145            chars,
146            position: 0,
147            read_position: 0,
148            current_char: None,
149            line: 1,
150            column: 0,
151        };
152
153        lexer.read_char();
154        lexer
155    }
156
157    /// Read the next character and update the position information.
158    fn read_char(&mut self) {
159        if self.read_position >= self.chars.len() {
160            self.current_char = None;
161        } else {
162            self.current_char = Some(self.chars[self.read_position]);
163        }
164
165        self.position = self.read_position;
166        self.read_position += 1;
167
168        if let Some('\n') = self.current_char {
169            self.line += 1;
170            self.column = 0;
171        } else {
172            self.column += 1;
173        }
174    }
175
176    /// Skip whitespace characters.
177    fn skip_whitespace(&mut self) {
178        while let Some(c) = self.current_char {
179            if c.is_whitespace() {
180                self.read_char();
181            } else {
182                break;
183            }
184        }
185    }
186
187    /// Determine if a character is a valid starting character for an identifier.
188    fn is_identifier_start(&self, c: char) -> bool {
189        c.is_alphabetic() || c == '_' || Self::is_unicode_identifier_part(c)
190    }
191
192    /// Determine if a character is a valid part of an identifier.
193    fn is_identifier_part(&self, c: char) -> bool {
194        c.is_alphanumeric() || c == '_' || Self::is_unicode_identifier_part(c)
195    }
196
197    /// Determine whether it is a supported Unicode identifier character.
198    fn is_unicode_identifier_part(c: char) -> bool {
199        (c >= '\u{4E00}' && c <= '\u{9FFF}')
200            || (c >= '\u{3040}' && c <= '\u{309F}')
201            || (c >= '\u{30A0}' && c <= '\u{30FF}')
202            || (c >= '\u{AC00}' && c <= '\u{D7AF}')
203            || (c >= '\u{1F600}' && c <= '\u{1F64F}')
204            || (c >= '\u{1F300}' && c <= '\u{1F5FF}')
205            || (c >= '\u{1F680}' && c <= '\u{1F6FF}')
206            || (c >= '\u{2600}' && c <= '\u{26FF}')
207    }
208
209    /// Determine if a character is a valid starting character for a function name.
210    fn is_function_name_start(&self, c: char) -> bool {
211        c.is_ascii_alphabetic()
212    }
213
214    /// Determine if a character is a valid part of a function name.
215    fn is_function_name_part(&self, c: char) -> bool {
216        c.is_ascii_alphanumeric() || c == '_'
217    }
218
219    /// Get the next token.
220    pub fn next_token(&mut self) -> Result<Token, LexerError> {
221        self.skip_whitespace();
222
223        if self.current_char.is_none() {
224            return Ok(Token::EOF);
225        }
226
227        match self.current_char.unwrap() {
228            '>' => {
229                self.read_char();
230                Ok(Token::Pipeline)
231            }
232            ',' => {
233                self.read_char();
234                Ok(Token::Comma)
235            }
236            ':' => {
237                self.read_char();
238                Ok(Token::Colon)
239            }
240            '|' => {
241                self.read_char();
242                Ok(Token::Union)
243            }
244            '^' => {
245                self.read_char();
246                Ok(Token::Difference)
247            }
248            '&' => {
249                self.read_char();
250                Ok(Token::Intersection)
251            }
252            '@' => self.read_function(),
253            '"' => self.read_quoted_string(),
254            '~' => {
255                self.read_char();
256                Ok(Token::Tilde)
257            }
258            '(' => {
259                self.read_char();
260                Ok(Token::LeftParen)
261            }
262            ')' => {
263                self.read_char();
264                Ok(Token::RightParen)
265            }
266            '[' => {
267                self.read_char();
268                Ok(Token::LeftBracket)
269            }
270            ']' => {
271                self.read_char();
272                Ok(Token::RightBracket)
273            }
274            '.' => {
275                if self.chars[self.read_position].is_ascii_digit() {
276                    self.read_number(true)
277                } else {
278                    self.read_char();
279                    Ok(Token::Dot)
280                }
281            }
282            '-' => {
283                self.read_char();
284                Ok(Token::Minus)
285            }
286            '0'..='9' => self.read_number(false),
287            '#' => {
288                self.read_char();
289                Ok(Token::Pound)
290            }
291            _ => self.read_string(),
292        }
293    }
294
295    /// Read numbers.
296    fn read_number(&mut self, has_dot: bool) -> Result<Token, LexerError> {
297        let start_position = self.position;
298
299        let mut has_dot_ = has_dot;
300        let mut end = 0;
301
302        if has_dot_ {
303            self.read_char();
304        }
305
306        while let Some(c) = self.current_char {
307            if c.is_ascii_digit() {
308                self.read_char();
309                end = self.position;
310            } else if c == '.' {
311                if !has_dot_ {
312                    has_dot_ = true;
313                    self.read_char();
314                    end = self.position;
315                } else {
316                    self.read_char();
317                    while let Some(c1) = self.current_char {
318                        if c1.is_ascii_digit() {
319                            self.read_char();
320                        } else {
321                            break;
322                        }
323                    }
324                }
325            } else {
326                break;
327            }
328        }
329
330        if has_dot_ {
331            let number_str: String = self.chars[start_position..end].iter().collect();
332            match number_str.parse::<f64>() {
333                Ok(float) => Ok(Token::Float(float)),
334                Err(_) => Err(LexerError {
335                    message: format!("Unable to resolve the float: {}", number_str),
336                    line: self.line,
337                    column: self.column,
338                }),
339            }
340        } else {
341            let number_str: String = self.chars[start_position..self.position].iter().collect();
342            match number_str.parse::<usize>() {
343                Ok(number) => Ok(Token::Number(number)),
344                Err(_) => Err(LexerError {
345                    message: format!("Unable to resolve the number: {}", number_str),
346                    line: self.line,
347                    column: self.column,
348                }),
349            }
350        }
351    }
352
353    /// Read identifiers (keywords such as class, id, etc.)
354    #[deprecated(note = "Keyword analysis has been added to the function for reading argument.")]
355    #[allow(dead_code)]
356    fn read_identifier(&mut self) -> Result<Token, LexerError> {
357        let start_position = self.position;
358
359        while let Some(c) = self.current_char {
360            if self.is_identifier_part(c) {
361                self.read_char();
362            } else {
363                break;
364            }
365        }
366
367        let identifier: String = self.chars[start_position..self.position].iter().collect();
368
369        match identifier.as_str() {
370            "class" => Ok(Token::Class),
371            "id" => Ok(Token::Id),
372            "tag" => Ok(Token::Tag),
373            "attr" => Ok(Token::Attr),
374            "text" => Ok(Token::Text),
375            "src" => Ok(Token::Src),
376            "href" => Ok(Token::Href),
377            _ => Err(LexerError {
378                message: "Illegal identifier".to_string(),
379                line: self.line,
380                column: self.column,
381            }),
382        }
383    }
384
385    /// Read the function name (the part after @)
386    fn read_function(&mut self) -> Result<Token, LexerError> {
387        self.read_char();
388
389        let start_position = self.position;
390
391        if let Some(c) = self.current_char {
392            if !self.is_function_name_start(c) {
393                return Err(LexerError {
394                    message: "Function names must start with a letter.".to_string(),
395                    line: self.line,
396                    column: self.column,
397                });
398            }
399        } else {
400            return Err(LexerError {
401                message: "Function name cannot be empty.".to_string(),
402                line: self.line,
403                column: self.column,
404            });
405        }
406
407        while let Some(c) = self.current_char {
408            if self.is_function_name_part(c) {
409                self.read_char();
410            } else {
411                break;
412            }
413        }
414
415        let function_name: String = self.chars[start_position..self.position].iter().collect();
416
417        Ok(Token::Function(function_name))
418    }
419
420    /// Read a quoted string.
421    fn read_quoted_string(&mut self) -> Result<Token, LexerError> {
422        self.read_char();
423
424        let mut value = String::new();
425        let mut escaped = false;
426
427        while let Some(c) = self.current_char {
428            if escaped {
429                match c {
430                    '"' => value.push('"'),
431                    '\\' => value.push('\\'),
432                    'n' => value.push('\n'),
433                    't' => value.push('\t'),
434                    'r' => value.push('\r'),
435                    'u' => {
436                        let mut unicode_value = String::new();
437                        for _ in 0..4 {
438                            self.read_char();
439                            if let Some(hex_char) = self.current_char {
440                                if hex_char.is_ascii_hexdigit() {
441                                    unicode_value.push(hex_char);
442                                } else {
443                                    return Err(LexerError {
444                                        message: format!(
445                                            "Invalid Unicode escape sequence: \\u{}",
446                                            unicode_value
447                                        ),
448                                        line: self.line,
449                                        column: self.column,
450                                    });
451                                }
452                            } else {
453                                return Err(LexerError {
454                                    message: "Unfinished Unicode escape sequence.".to_string(),
455                                    line: self.line,
456                                    column: self.column,
457                                });
458                            }
459                        }
460
461                        // Convert hexadecimal values to Unicode characters
462                        if let Ok(code_point) = u32::from_str_radix(&unicode_value, 16) {
463                            if let Some(unicode_char) = std::char::from_u32(code_point) {
464                                value.push(unicode_char);
465                            } else {
466                                return Err(LexerError {
467                                    message: format!(
468                                        "Invalid Unicode code point: U+{}",
469                                        unicode_value
470                                    ),
471                                    line: self.line,
472                                    column: self.column,
473                                });
474                            }
475                        } else {
476                            return Err(LexerError {
477                                message: format!(
478                                    "Unable to resolve Unicode escape sequence: \\u{}",
479                                    unicode_value
480                                ),
481                                line: self.line,
482                                column: self.column,
483                            });
484                        }
485                    }
486                    _ => value.push(c),
487                }
488                escaped = false;
489                self.read_char();
490            } else if c == '\\' {
491                escaped = true;
492                self.read_char();
493            } else if c == '"' {
494                self.read_char();
495                return Ok(Token::String(value));
496            } else {
497                value.push(c);
498                self.read_char();
499            }
500        }
501
502        Err(LexerError {
503            message: "Unterminated string.".to_string(),
504            line: self.line,
505            column: self.column,
506        })
507    }
508
509    /// Read normal parameters
510    fn read_string(&mut self) -> Result<Token, LexerError> {
511        let start_position = self.position;
512
513        while let Some(c) = self.current_char {
514            if c.is_whitespace() || c == '>' || c == ',' || c == '"' || c == '@' || c == ':' {
515                break;
516            }
517            self.read_char();
518        }
519
520        let argument: String = self.chars[start_position..self.position].iter().collect();
521
522        if argument.is_empty() {
523            return Err(LexerError {
524                message: format!("Unrecognized characters: {:?}", self.current_char),
525                line: self.line,
526                column: self.column,
527            });
528        }
529
530        // check if it is a keyword
531        match argument.as_str() {
532            "class" => Ok(Token::Class),
533            "id" => Ok(Token::Id),
534            "tag" => Ok(Token::Tag),
535            "attr" => Ok(Token::Attr),
536            "text" => Ok(Token::Text),
537            "src" => Ok(Token::Src),
538            "href" => Ok(Token::Href),
539            "true" => Ok(Token::Bool(true)),
540            "false" => Ok(Token::Bool(false)),
541            "nil" => Ok(Token::Nil),
542            _ => Ok(Token::String(argument)),
543        }
544    }
545
546    fn recover_from_error(&mut self) {
547        while let Some(c) = self.current_char {
548            if c == '>' || c == ',' || c == '"' || c == '@' || self.is_identifier_start(c) {
549                break;
550            }
551            self.read_char();
552        }
553    }
554}
555
556pub fn tokenize(input: &str) -> Vec<(Token, usize, usize)> {
557    let mut lexer = Lexer::new(input);
558
559    let estimated_tokens = (input.len() / 4).max(8);
560    let mut tokens_with_pos = Vec::with_capacity(estimated_tokens);
561
562    loop {
563        let line = lexer.line;
564        let column = lexer.column;
565
566        match lexer.next_token() {
567            Ok(Token::EOF) => {
568                tokens_with_pos.push((Token::EOF, line, column));
569                break;
570            }
571            Ok(token) => tokens_with_pos.push((token, line, column)),
572            Err(_) => lexer.recover_from_error(),
573        }
574    }
575    tokens_with_pos
576}