oak_css/lexer/
mod.rs

1use crate::{kind::CssSyntaxKind, language::CssLanguage};
2use oak_core::{IncrementalCache, Lexer, LexerState, SourceText, lexer::LexOutput, source::Source};
3
4type State<'input> = LexerState<&'input SourceText, CssLanguage>;
5
/// Lexer that splits CSS source text into `CssSyntaxKind` tokens.
///
/// The type is a unit struct with no fields, so it is trivially copyable and
/// can be created either through its constructor or through `Default`.
#[derive(Debug, Clone, Copy, Default)]
pub struct CssLexer;
7
8impl CssLexer {
9    pub fn new(_config: CssLanguage) -> Self {
10        Self
11    }
12
13    /// 跳过空白字符
14    fn skip_whitespace(&self, state: &mut State<'_>) -> bool {
15        let start_pos = state.get_position();
16
17        while let Some(ch) = state.peek() {
18            if ch == ' ' || ch == '\t' {
19                state.advance(ch.len_utf8());
20            }
21            else {
22                break;
23            }
24        }
25
26        if state.get_position() > start_pos {
27            state.add_token(CssSyntaxKind::Whitespace, start_pos, state.get_position());
28            true
29        }
30        else {
31            false
32        }
33    }
34
35    /// 处理换行
36    fn lex_newline(&self, state: &mut State<'_>) -> bool {
37        let start_pos = state.get_position();
38
39        if let Some('\n') = state.peek() {
40            state.advance(1);
41            state.add_token(CssSyntaxKind::Newline, start_pos, state.get_position());
42            true
43        }
44        else if let Some('\r') = state.peek() {
45            state.advance(1);
46            if let Some('\n') = state.peek() {
47                state.advance(1);
48            }
49            state.add_token(CssSyntaxKind::Newline, start_pos, state.get_position());
50            true
51        }
52        else {
53            false
54        }
55    }
56
57    /// 处理注释
58    fn lex_comment(&self, state: &mut State<'_>) -> bool {
59        let start_pos = state.get_position();
60
61        if let Some('/') = state.peek() {
62            if let Some('*') = state.peek_next_n(1) {
63                state.advance(2); // Skip /*
64
65                while let Some(ch) = state.peek() {
66                    if ch == '*' && state.peek_next_n(1) == Some('/') {
67                        state.advance(2); // Skip */
68                        break;
69                    }
70                    state.advance(ch.len_utf8());
71                }
72
73                state.add_token(CssSyntaxKind::Comment, start_pos, state.get_position());
74                true
75            }
76            else {
77                false
78            }
79        }
80        else {
81            false
82        }
83    }
84
85    /// 处理字符串字面量
86    fn lex_string(&self, state: &mut State<'_>) -> bool {
87        let start_pos = state.get_position();
88
89        if let Some(quote) = state.peek() {
90            if quote == '"' || quote == '\'' {
91                state.advance(1); // Skip opening quote
92
93                while let Some(ch) = state.peek() {
94                    if ch == quote {
95                        state.advance(1); // Skip closing quote
96                        break;
97                    }
98                    else if ch == '\\' {
99                        state.advance(1); // Skip escape character
100                        if state.peek().is_some() {
101                            state.advance(state.peek().unwrap().len_utf8()); // Skip escaped character
102                        }
103                    }
104                    else {
105                        state.advance(ch.len_utf8());
106                    }
107                }
108
109                state.add_token(CssSyntaxKind::StringLiteral, start_pos, state.get_position());
110                true
111            }
112            else {
113                false
114            }
115        }
116        else {
117            false
118        }
119    }
120
121    /// 处理数字字面
122    fn lex_number(&self, state: &mut State<'_>) -> bool {
123        let start_pos = state.get_position();
124
125        if let Some(ch) = state.peek() {
126            if ch.is_ascii_digit() || (ch == '.' && state.peek_next_n(1).map_or(false, |c| c.is_ascii_digit())) {
127                // Integer part
128                while let Some(ch) = state.peek() {
129                    if ch.is_ascii_digit() {
130                        state.advance(1);
131                    }
132                    else {
133                        break;
134                    }
135                }
136
137                // Decimal part
138                if let Some('.') = state.peek() {
139                    state.advance(1);
140                    while let Some(ch) = state.peek() {
141                        if ch.is_ascii_digit() {
142                            state.advance(1);
143                        }
144                        else {
145                            break;
146                        }
147                    }
148                }
149
150                // Exponent part
151                if let Some(ch) = state.peek() {
152                    if ch == 'e' || ch == 'E' {
153                        state.advance(1);
154                        if let Some(sign) = state.peek() {
155                            if sign == '+' || sign == '-' {
156                                state.advance(1);
157                            }
158                        }
159                        while let Some(ch) = state.peek() {
160                            if ch.is_ascii_digit() {
161                                state.advance(1);
162                            }
163                            else {
164                                break;
165                            }
166                        }
167                    }
168                }
169
170                // Check for units
171                while let Some(ch) = state.peek() {
172                    if ch.is_alphabetic() || ch == '%' {
173                        state.advance(ch.len_utf8());
174                    }
175                    else {
176                        break;
177                    }
178                }
179
180                state.add_token(CssSyntaxKind::NumberLiteral, start_pos, state.get_position());
181                true
182            }
183            else {
184                false
185            }
186        }
187        else {
188            false
189        }
190    }
191
192    /// 处理颜色字面
193    fn lex_color(&self, state: &mut State<'_>) -> bool {
194        let start_pos = state.get_position();
195
196        if let Some('#') = state.peek() {
197            state.advance(1); // Skip #
198
199            let mut hex_count = 0;
200            while let Some(ch) = state.peek() {
201                if ch.is_ascii_hexdigit() && hex_count < 8 {
202                    state.advance(1);
203                    hex_count += 1;
204                }
205                else {
206                    break;
207                }
208            }
209
210            let token_kind = if matches!(hex_count, 3 | 4 | 6 | 8) { CssSyntaxKind::ColorLiteral } else { CssSyntaxKind::Hash };
211
212            state.add_token(token_kind, start_pos, state.get_position());
213            true
214        }
215        else {
216            false
217        }
218    }
219
220    /// 处理 URL 字面
221    fn lex_url(&self, state: &mut State<'_>) -> bool {
222        let start_pos = state.get_position();
223
224        if let Some('u') = state.peek() {
225            if state.peek_next_n(1) == Some('r') && state.peek_next_n(2) == Some('l') && state.peek_next_n(3) == Some('(') {
226                state.advance(4); // Skip "url("
227
228                // Skip whitespace
229                while let Some(ch) = state.peek() {
230                    if ch.is_whitespace() {
231                        state.advance(ch.len_utf8());
232                    }
233                    else {
234                        break;
235                    }
236                }
237
238                // Handle quoted or unquoted URL
239                if let Some(quote) = state.peek() {
240                    if quote == '"' || quote == '\'' {
241                        self.lex_string(state);
242                    }
243                    else {
244                        while let Some(ch) = state.peek() {
245                            if ch == ')' || ch.is_whitespace() {
246                                break;
247                            }
248                            state.advance(ch.len_utf8());
249                        }
250                    }
251                }
252
253                // Skip whitespace
254                while let Some(ch) = state.peek() {
255                    if ch.is_whitespace() {
256                        state.advance(ch.len_utf8());
257                    }
258                    else {
259                        break;
260                    }
261                }
262
263                // Skip closing )
264                if let Some(')') = state.peek() {
265                    state.advance(1);
266                }
267
268                state.add_token(CssSyntaxKind::UrlLiteral, start_pos, state.get_position());
269                true
270            }
271            else {
272                false
273            }
274        }
275        else {
276            false
277        }
278    }
279
280    /// 处理标识
281    fn lex_identifier(&self, state: &mut State<'_>) -> bool {
282        let start_pos = state.get_position();
283
284        if let Some(ch) = state.peek() {
285            if ch.is_alphabetic() || ch == '_' || ch == '-' {
286                while let Some(ch) = state.peek() {
287                    if ch.is_alphanumeric() || ch == '_' || ch == '-' {
288                        state.advance(ch.len_utf8());
289                    }
290                    else {
291                        break;
292                    }
293                }
294
295                state.add_token(CssSyntaxKind::Identifier, start_pos, state.get_position());
296                true
297            }
298            else {
299                false
300            }
301        }
302        else {
303            false
304        }
305    }
306
307    /// 处理 at-rule
308    fn lex_at_rule(&self, state: &mut State<'_>, source: &SourceText) -> bool {
309        let start_pos = state.get_position();
310
311        if let Some('@') = state.peek() {
312            state.advance(1); // Skip @
313
314            let rule_start = state.get_position();
315            while let Some(ch) = state.peek() {
316                if ch.is_alphabetic() || ch == '-' {
317                    state.advance(ch.len_utf8());
318                }
319                else {
320                    break;
321                }
322            }
323
324            let rule_name = source.get_text_in((rule_start..state.get_position()).into());
325            let token_kind = match rule_name {
326                "import" => CssSyntaxKind::AtImport,
327                "media" => CssSyntaxKind::AtMedia,
328                "keyframes" => CssSyntaxKind::AtKeyframes,
329                "font-face" => CssSyntaxKind::AtFontFace,
330                "charset" => CssSyntaxKind::AtCharset,
331                "namespace" => CssSyntaxKind::AtNamespace,
332                "supports" => CssSyntaxKind::AtSupports,
333                "page" => CssSyntaxKind::AtPage,
334                "document" => CssSyntaxKind::AtDocument,
335                _ => CssSyntaxKind::AtRule,
336            };
337
338            state.add_token(token_kind, start_pos, state.get_position());
339            true
340        }
341        else {
342            false
343        }
344    }
345
346    /// 处理操作
347    fn lex_operator(&self, state: &mut State<'_>) -> bool {
348        let start_pos = state.get_position();
349
350        if let Some(ch) = state.peek() {
351            let token_kind = match ch {
352                ':' => CssSyntaxKind::Colon,
353                ';' => CssSyntaxKind::Semicolon,
354                ',' => CssSyntaxKind::Comma,
355                '.' => CssSyntaxKind::Dot,
356                '#' => CssSyntaxKind::Hash,
357                '+' => CssSyntaxKind::Plus,
358                '-' => CssSyntaxKind::Minus,
359                '*' => CssSyntaxKind::Star,
360                '/' => CssSyntaxKind::Slash,
361                '=' => CssSyntaxKind::Equals,
362                '~' => CssSyntaxKind::Tilde,
363                '|' => CssSyntaxKind::Pipe,
364                '^' => CssSyntaxKind::Caret,
365                '$' => CssSyntaxKind::Dollar,
366                '>' => CssSyntaxKind::GreaterThan,
367                _ => return false,
368            };
369
370            state.advance(1);
371            state.add_token(token_kind, start_pos, state.get_position());
372            true
373        }
374        else {
375            false
376        }
377    }
378
379    /// 处理分隔
380    fn lex_delimiter(&self, state: &mut State<'_>) -> bool {
381        let start_pos = state.get_position();
382
383        if let Some(ch) = state.peek() {
384            let token_kind = match ch {
385                '(' => CssSyntaxKind::LeftParen,
386                ')' => CssSyntaxKind::RightParen,
387                '{' => CssSyntaxKind::LeftBrace,
388                '}' => CssSyntaxKind::RightBrace,
389                '[' => CssSyntaxKind::LeftBracket,
390                ']' => CssSyntaxKind::RightBracket,
391                _ => return false,
392            };
393
394            state.advance(1);
395            state.add_token(token_kind, start_pos, state.get_position());
396            true
397        }
398        else {
399            false
400        }
401    }
402}
403
404impl Lexer<CssLanguage> for CssLexer {
405    fn lex(&self, source: impl Source) -> LexOutput<CssLanguage> {
406        let source_text = SourceText::new(source.get_text_in((0..source.length()).into()));
407        let mut state = LexerState::new(&source_text);
408
409        while state.not_at_end() {
410            // 尝试各种词法规则
411            if self.skip_whitespace(&mut state) {
412                continue;
413            }
414
415            if self.lex_newline(&mut state) {
416                continue;
417            }
418
419            if self.lex_comment(&mut state) {
420                continue;
421            }
422
423            if self.lex_string(&mut state) {
424                continue;
425            }
426
427            if self.lex_url(&mut state) {
428                continue;
429            }
430
431            if self.lex_color(&mut state) {
432                continue;
433            }
434
435            if self.lex_number(&mut state) {
436                continue;
437            }
438
439            if self.lex_at_rule(&mut state, &source_text) {
440                continue;
441            }
442
443            if self.lex_identifier(&mut state) {
444                continue;
445            }
446
447            if self.lex_delimiter(&mut state) {
448                continue;
449            }
450
451            if self.lex_operator(&mut state) {
452                continue;
453            }
454
455            // 如果所有规则都不匹配,跳过当前字符并标记为错误
456            let start_pos = state.get_position();
457            if let Some(ch) = state.peek() {
458                state.advance(ch.len_utf8());
459                state.add_token(CssSyntaxKind::Error, start_pos, state.get_position());
460            }
461        }
462
463        // 添加 EOF kind
464        let eof_pos = state.get_position();
465        state.add_token(CssSyntaxKind::Eof, eof_pos, eof_pos);
466
467        state.finish(Ok(()))
468    }
469
470    fn lex_incremental(
471        &self,
472        source: impl Source,
473        _changed: usize,
474        _cache: IncrementalCache<CssLanguage>,
475    ) -> LexOutput<CssLanguage> {
476        let source_text = SourceText::new(source.get_text_in((0..source.length()).into()));
477        let mut state = LexerState::new_with_cache(&source_text, _changed, _cache);
478
479        while state.not_at_end() {
480            // 尝试各种词法规则
481            if self.skip_whitespace(&mut state) {
482                continue;
483            }
484
485            if self.lex_newline(&mut state) {
486                continue;
487            }
488
489            if self.lex_comment(&mut state) {
490                continue;
491            }
492
493            if self.lex_string(&mut state) {
494                continue;
495            }
496
497            if self.lex_url(&mut state) {
498                continue;
499            }
500
501            if self.lex_color(&mut state) {
502                continue;
503            }
504
505            if self.lex_number(&mut state) {
506                continue;
507            }
508
509            if self.lex_at_rule(&mut state, &source_text) {
510                continue;
511            }
512
513            if self.lex_identifier(&mut state) {
514                continue;
515            }
516
517            if self.lex_delimiter(&mut state) {
518                continue;
519            }
520
521            if self.lex_operator(&mut state) {
522                continue;
523            }
524
525            // 如果所有规则都不匹配,跳过当前字符并标记为错误
526            let start_pos = state.get_position();
527            if let Some(ch) = state.peek() {
528                state.advance(ch.len_utf8());
529                state.add_token(CssSyntaxKind::Error, start_pos, state.get_position());
530            }
531        }
532
533        // 添加 EOF kind
534        let eof_pos = state.get_position();
535        state.add_token(CssSyntaxKind::Eof, eof_pos, eof_pos);
536
537        state.finish(Ok(()))
538    }
539}