// oak_css/lexer/mod.rs

1pub mod token_type;
2use crate::language::CssLanguage;
3use oak_core::{Lexer, LexerState, OakError, lexer::LexOutput, source::Source};
4pub use token_type::CssTokenType;
5
6type State<'s, S> = LexerState<'s, S, CssLanguage>;
7
/// Lexer for CSS source text, parameterized by a language configuration.
pub struct CssLexer<'config> {
    // Currently unused (hence the leading underscore); kept so future lexing
    // decisions can consult the language configuration without an API change.
    _config: &'config CssLanguage,
}
11
12impl<'config> CssLexer<'config> {
    /// Creates a lexer borrowing the given language configuration.
    pub fn new(config: &'config CssLanguage) -> Self {
        Self { _config: config }
    }
16
17    /// 跳过空白字符
18    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
19        let start_pos = state.get_position();
20
21        while let Some(ch) = state.peek() {
22            if ch == ' ' || ch == '\t' {
23                state.advance(ch.len_utf8());
24            }
25            else {
26                break;
27            }
28        }
29
30        if state.get_position() > start_pos {
31            state.add_token(CssTokenType::Whitespace, start_pos, state.get_position());
32            true
33        }
34        else {
35            false
36        }
37    }
38
39    /// 处理换行
40    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
41        let start_pos = state.get_position();
42
43        if let Some('\n') = state.peek() {
44            state.advance(1);
45            state.add_token(CssTokenType::Newline, start_pos, state.get_position());
46            true
47        }
48        else if let Some('\r') = state.peek() {
49            state.advance(1);
50            if let Some('\n') = state.peek() {
51                state.advance(1);
52            }
53            state.add_token(CssTokenType::Newline, start_pos, state.get_position());
54            true
55        }
56        else {
57            false
58        }
59    }
60
61    /// 处理注释
62    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
63        let start_pos = state.get_position();
64
65        if let Some('/') = state.peek() {
66            if let Some('*') = state.peek_next_n(1) {
67                state.advance(2); // Skip /*
68
69                while let Some(ch) = state.peek() {
70                    if ch == '*' && state.peek_next_n(1) == Some('/') {
71                        state.advance(2); // Skip */
72                        break;
73                    }
74                    state.advance(ch.len_utf8());
75                }
76
77                state.add_token(CssTokenType::Comment, start_pos, state.get_position());
78                true
79            }
80            else {
81                false
82            }
83        }
84        else {
85            false
86        }
87    }
88
89    /// 处理字符串字面量
90    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
91        let start_pos = state.get_position();
92
93        if let Some(quote) = state.peek() {
94            if quote == '"' || quote == '\'' {
95                state.advance(1); // Skip opening quote
96
97                while let Some(ch) = state.peek() {
98                    if ch == quote {
99                        state.advance(1); // Skip closing quote
100                        break;
101                    }
102                    else if ch == '\\' {
103                        state.advance(1); // Skip escape character
104                        if state.peek().is_some() {
105                            state.advance(1);
106                        }
107                    }
108                    else {
109                        state.advance(ch.len_utf8());
110                    }
111                }
112
113                state.add_token(CssTokenType::StringLiteral, start_pos, state.get_position());
114                true
115            }
116            else {
117                false
118            }
119        }
120        else {
121            false
122        }
123    }
124
125    /// 处理 URL
126    fn lex_url<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
127        let start_pos = state.get_position();
128
129        if let Some('u') = state.peek() {
130            if state.peek_next_n(1) == Some('r') && state.peek_next_n(2) == Some('l') && state.peek_next_n(3) == Some('(') {
131                state.advance(4); // Skip url(
132
133                // Skip whitespace
134                while let Some(ch) = state.peek() {
135                    if ch.is_whitespace() {
136                        state.advance(ch.len_utf8());
137                    }
138                    else {
139                        break;
140                    }
141                }
142
143                // Check for quoted or unquoted URL
144                if let Some(quote) = state.peek() {
145                    if quote == '"' || quote == '\'' {
146                        state.advance(1);
147                        while let Some(ch) = state.peek() {
148                            if ch == quote {
149                                state.advance(1);
150                                break;
151                            }
152                            else if ch == '\\' {
153                                state.advance(1);
154                                if state.peek().is_some() {
155                                    state.advance(1);
156                                }
157                            }
158                            else {
159                                state.advance(ch.len_utf8());
160                            }
161                        }
162                    }
163                    else {
164                        while let Some(ch) = state.peek() {
165                            if ch == ')' || ch.is_whitespace() {
166                                break;
167                            }
168                            state.advance(ch.len_utf8());
169                        }
170                    }
171                }
172
173                // Skip whitespace
174                while let Some(ch) = state.peek() {
175                    if ch.is_whitespace() {
176                        state.advance(ch.len_utf8());
177                    }
178                    else {
179                        break;
180                    }
181                }
182
183                // Skip closing )
184                if let Some(')') = state.peek() {
185                    state.advance(1);
186                }
187
188                state.add_token(CssTokenType::UrlLiteral, start_pos, state.get_position());
189                true
190            }
191            else {
192                false
193            }
194        }
195        else {
196            false
197        }
198    }
199
200    /// 处理颜色字面量
201    fn lex_color<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
202        let start_pos = state.get_position();
203
204        if let Some('#') = state.peek() {
205            state.advance(1); // Skip #
206
207            let mut count = 0;
208            while let Some(ch) = state.peek() {
209                if ch.is_ascii_hexdigit() {
210                    state.advance(1);
211                    count += 1;
212                }
213                else {
214                    break;
215                }
216            }
217
218            if count == 3 || count == 4 || count == 6 || count == 8 {
219                state.add_token(CssTokenType::ColorLiteral, start_pos, state.get_position());
220                true
221            }
222            else {
223                // Not a valid color, but we'll treat it as a hash + something else
224                // This is a simplification for the lexer
225                state.add_token(CssTokenType::Hash, start_pos, start_pos + 1);
226                state.set_position(start_pos + 1);
227                true
228            }
229        }
230        else {
231            false
232        }
233    }
234
235    /// 处理数字字面量
236    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
237        let start_pos = state.get_position();
238
239        let mut has_digits = false;
240        if let Some(ch) = state.peek() {
241            if ch == '+' || ch == '-' {
242                state.advance(1);
243            }
244        }
245
246        while let Some(ch) = state.peek() {
247            if ch.is_ascii_digit() {
248                state.advance(1);
249                has_digits = true;
250            }
251            else {
252                break;
253            }
254        }
255
256        if let Some('.') = state.peek() {
257            if let Some(next_ch) = state.peek_next_n(1) {
258                if next_ch.is_ascii_digit() {
259                    state.advance(1); // Skip .
260                    while let Some(ch) = state.peek() {
261                        if ch.is_ascii_digit() {
262                            state.advance(1);
263                            has_digits = true;
264                        }
265                        else {
266                            break;
267                        }
268                    }
269                }
270            }
271        }
272
273        if has_digits {
274            // Check for units
275            let unit_start = state.get_position();
276            while let Some(ch) = state.peek() {
277                if ch.is_alphabetic() || ch == '%' {
278                    state.advance(ch.len_utf8());
279                }
280                else {
281                    break;
282                }
283            }
284
285            if state.get_position() > unit_start {
286                // We have a number with a unit
287                state.add_token(CssTokenType::NumberLiteral, start_pos, state.get_position());
288            }
289            else {
290                state.add_token(CssTokenType::NumberLiteral, start_pos, state.get_position());
291            }
292            true
293        }
294        else {
295            state.set_position(start_pos);
296            false
297        }
298    }
299
300    /// 处理标识符
301    fn lex_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
302        let start_pos = state.get_position();
303
304        if let Some(ch) = state.peek() {
305            if ch.is_alphabetic() || ch == '_' || ch == '-' {
306                while let Some(ch) = state.peek() {
307                    if ch.is_alphanumeric() || ch == '_' || ch == '-' {
308                        state.advance(ch.len_utf8());
309                    }
310                    else {
311                        break;
312                    }
313                }
314
315                state.add_token(CssTokenType::Identifier, start_pos, state.get_position());
316                true
317            }
318            else {
319                false
320            }
321        }
322        else {
323            false
324        }
325    }
326
327    /// 处理 at-rule
328    fn lex_at_rule<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
329        let start_pos = state.get_position();
330
331        if let Some('@') = state.peek() {
332            state.advance(1); // Skip @
333
334            let rule_start = state.get_position();
335            while let Some(ch) = state.peek() {
336                if ch.is_alphabetic() || ch == '-' {
337                    state.advance(ch.len_utf8());
338                }
339                else {
340                    break;
341                }
342            }
343
344            let rule_name = state.get_text_in((rule_start..state.get_position()).into());
345            let token_kind = match rule_name.as_ref() {
346                "import" => CssTokenType::AtImport,
347                "media" => CssTokenType::AtMedia,
348                "keyframes" => CssTokenType::AtKeyframes,
349                "font-face" => CssTokenType::AtFontFace,
350                "charset" => CssTokenType::AtCharset,
351                "namespace" => CssTokenType::AtNamespace,
352                "supports" => CssTokenType::AtSupports,
353                "page" => CssTokenType::AtPage,
354                "document" => CssTokenType::AtDocument,
355                _ => CssTokenType::AtRule,
356            };
357
358            state.add_token(token_kind, start_pos, state.get_position());
359            true
360        }
361        else {
362            false
363        }
364    }
365
366    /// 处理分隔符
367    fn lex_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
368        let start_pos = state.get_position();
369
370        if let Some(ch) = state.peek() {
371            let token_kind = match ch {
372                '(' => CssTokenType::LeftParen,
373                ')' => CssTokenType::RightParen,
374                '{' => CssTokenType::LeftBrace,
375                '}' => CssTokenType::RightBrace,
376                '[' => CssTokenType::LeftBracket,
377                ']' => CssTokenType::RightBracket,
378                ',' => CssTokenType::Comma,
379                ';' => CssTokenType::Semicolon,
380                _ => return false,
381            };
382
383            state.advance(1);
384            state.add_token(token_kind, start_pos, state.get_position());
385            true
386        }
387        else {
388            false
389        }
390    }
391
392    /// 处理操作符
393    fn lex_operator<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
394        let start_pos = state.get_position();
395
396        if let Some(ch) = state.peek() {
397            let token_kind = match ch {
398                ':' => CssTokenType::Colon,
399                '.' => CssTokenType::Dot,
400                '#' => CssTokenType::Hash,
401                '+' => CssTokenType::Plus,
402                '-' => CssTokenType::Minus,
403                '*' => CssTokenType::Star,
404                '/' => CssTokenType::Slash,
405                '=' => CssTokenType::Equals,
406                '~' => CssTokenType::Tilde,
407                '|' => CssTokenType::Pipe,
408                '^' => CssTokenType::Caret,
409                '$' => CssTokenType::Dollar,
410                '>' => CssTokenType::GreaterThan,
411                _ => return false,
412            };
413
414            state.advance(1);
415            state.add_token(token_kind, start_pos, state.get_position());
416            true
417        }
418        else {
419            false
420        }
421    }
422
423    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
424        while state.not_at_end() {
425            let safe_point = state.get_position();
426
427            // 尝试各种词法规则
428            if self.skip_whitespace(state) {
429                continue;
430            }
431
432            if self.lex_newline(state) {
433                continue;
434            }
435
436            if self.lex_comment(state) {
437                continue;
438            }
439
440            if self.lex_string(state) {
441                continue;
442            }
443
444            if self.lex_url(state) {
445                continue;
446            }
447
448            if self.lex_color(state) {
449                continue;
450            }
451
452            if self.lex_number(state) {
453                continue;
454            }
455
456            if self.lex_at_rule(state) {
457                continue;
458            }
459
460            if self.lex_identifier(state) {
461                continue;
462            }
463
464            if self.lex_delimiter(state) {
465                continue;
466            }
467
468            if self.lex_operator(state) {
469                continue;
470            }
471
472            // 如果所有规则都不匹配,跳过当前字符并标记为错误
473            let start_pos = state.get_position();
474            if let Some(ch) = state.peek() {
475                state.advance(ch.len_utf8());
476                state.add_token(CssTokenType::Error, start_pos, state.get_position());
477            }
478            else {
479                break;
480            }
481
482            state.advance_if_dead_lock(safe_point);
483        }
484        Ok(())
485    }
486}
487
impl<'config> Lexer<CssLanguage> for CssLexer<'config> {
    /// Tokenizes `source` from scratch; `_edits` is ignored (no incremental
    /// relexing yet) and the result is finalized through the provided cache.
    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::source::TextEdit], mut cache: &'a mut impl oak_core::lexer::LexerCache<CssLanguage>) -> LexOutput<CssLanguage> {
        let mut state = LexerState::new(source);
        let result = self.run(&mut state);
        // Only a successful run gets an EOF token appended.
        if result.is_ok() {
            state.add_eof();
        }
        // NOTE(review): `&mut cache` here produces `&mut &mut impl
        // LexerCache` — the extra `mut` binding plus reborrow looks
        // redundant; confirm against `finish_with_cache`'s signature.
        state.finish_with_cache(result, &mut cache)
    }
}
497}