// oak_css/lexer/mod.rs
#![doc = include_str!("readme.md")]
/// CSS token types and role definitions.
pub mod token_type;
use crate::language::CssLanguage;
use oak_core::{Lexer, LexerState, OakError, lexer::LexOutput, source::Source};
pub use token_type::CssTokenType;

/// Shorthand for the shared lexer state specialized to [`CssLanguage`].
type State<'s, S> = LexerState<'s, S, CssLanguage>;
10/// Lexer for the CSS language.
/// Lexer for the CSS language.
///
/// Borrows its [`CssLanguage`] configuration for the lifetime `'config`;
/// construct via [`CssLexer::new`].
pub struct CssLexer<'config> {
    /// Language configuration.
    // NOTE(review): not read by any lexing rule visible in this module —
    // presumably kept for future configuration-driven behavior; confirm.
    config: &'config CssLanguage,
}
15
16impl<'config> CssLexer<'config> {
17    /// Creates a new `CssLexer` with the given language configuration.
18    pub fn new(config: &'config CssLanguage) -> Self {
19        Self { config }
20    }
21
22    /// Skips whitespace characters.
23    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
24        let start_pos = state.get_position();
25
26        while let Some(ch) = state.peek() {
27            if ch == ' ' || ch == '\t' { state.advance(ch.len_utf8()) } else { break }
28        }
29
30        if state.get_position() > start_pos {
31            state.add_token(CssTokenType::Whitespace, start_pos, state.get_position());
32            true
33        }
34        else {
35            false
36        }
37    }
38
39    /// Handles newline characters.
40    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
41        let start_pos = state.get_position();
42
43        if let Some('\n') = state.peek() {
44            state.advance(1);
45            state.add_token(CssTokenType::Newline, start_pos, state.get_position());
46            true
47        }
48        else if let Some('\r') = state.peek() {
49            state.advance(1);
50            if let Some('\n') = state.peek() {
51                state.advance(1)
52            }
53            state.add_token(CssTokenType::Newline, start_pos, state.get_position());
54            true
55        }
56        else {
57            false
58        }
59    }
60
61    /// Handles CSS comments (`/* ... */`).
62    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
63        let start_pos = state.get_position();
64
65        if let Some('/') = state.peek() {
66            if let Some('*') = state.peek_next_n(1) {
67                state.advance(2); // Skip /*
68
69                while let Some(ch) = state.peek() {
70                    if ch == '*' && state.peek_next_n(1) == Some('/') {
71                        state.advance(2); // Skip */
72                        break;
73                    }
74                    state.advance(ch.len_utf8())
75                }
76
77                state.add_token(CssTokenType::Comment, start_pos, state.get_position());
78                true
79            }
80            else {
81                false
82            }
83        }
84        else {
85            false
86        }
87    }
88
89    /// Handles string literals (both single and double quoted).
90    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
91        let start_pos = state.get_position();
92
93        if let Some(quote) = state.peek() {
94            if quote == '"' || quote == '\'' {
95                state.advance(1); // Skip opening quote
96
97                while let Some(ch) = state.peek() {
98                    if ch == quote {
99                        state.advance(1); // Skip closing quote
100                        break;
101                    }
102                    else if ch == '\\' {
103                        state.advance(1); // Skip escape character
104                        if state.peek().is_some() {
105                            state.advance(1)
106                        }
107                    }
108                    else {
109                        state.advance(ch.len_utf8())
110                    }
111                }
112
113                state.add_token(CssTokenType::StringLiteral, start_pos, state.get_position());
114                true
115            }
116            else {
117                false
118            }
119        }
120        else {
121            false
122        }
123    }
124
125    /// Handles CSS URLs (`url(...)`).
126    fn lex_url<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
127        let start_pos = state.get_position();
128
129        if let Some('u') = state.peek() {
130            if state.peek_next_n(1) == Some('r') && state.peek_next_n(2) == Some('l') && state.peek_next_n(3) == Some('(') {
131                state.advance(4); // Skip url(
132
133                // Skip whitespace
134                while let Some(ch) = state.peek() {
135                    if ch.is_whitespace() { state.advance(ch.len_utf8()) } else { break }
136                }
137
138                // Check for quoted or unquoted URL
139                if let Some(quote) = state.peek() {
140                    if quote == '"' || quote == '\'' {
141                        state.advance(1);
142                        while let Some(ch) = state.peek() {
143                            if ch == quote {
144                                state.advance(1);
145                                break;
146                            }
147                            else if ch == '\\' {
148                                state.advance(1);
149                                if state.peek().is_some() {
150                                    state.advance(1)
151                                }
152                            }
153                            else {
154                                state.advance(ch.len_utf8())
155                            }
156                        }
157                    }
158                    else {
159                        while let Some(ch) = state.peek() {
160                            if ch == ')' || ch.is_whitespace() {
161                                break;
162                            }
163                            state.advance(ch.len_utf8())
164                        }
165                    }
166                }
167
168                // Skip whitespace
169                while let Some(ch) = state.peek() {
170                    if ch.is_whitespace() { state.advance(ch.len_utf8()) } else { break }
171                }
172
173                // Skip closing )
174                if let Some(')') = state.peek() {
175                    state.advance(1)
176                }
177
178                state.add_token(CssTokenType::UrlLiteral, start_pos, state.get_position());
179                true
180            }
181            else {
182                false
183            }
184        }
185        else {
186            false
187        }
188    }
189
190    /// Handles color literals (e.g., `#fff`, `#ffffff`).
191    fn lex_color<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
192        let start_pos = state.get_position();
193
194        if let Some('#') = state.peek() {
195            state.advance(1); // Skip #
196
197            let mut count = 0;
198            while let Some(ch) = state.peek() {
199                if ch.is_ascii_hexdigit() {
200                    state.advance(1);
201                    count += 1
202                }
203                else {
204                    break;
205                }
206            }
207
208            if count == 3 || count == 4 || count == 6 || count == 8 {
209                state.add_token(CssTokenType::ColorLiteral, start_pos, state.get_position());
210                true
211            }
212            else {
213                // Not a valid color, but we'll treat it as a hash + something else
214                // This is a simplification for the lexer
215                state.add_token(CssTokenType::Hash, start_pos, start_pos + 1);
216                state.set_position(start_pos + 1);
217                true
218            }
219        }
220        else {
221            false
222        }
223    }
224
225    /// Handles number literals and units (e.g., `10px`, `1.5em`, `100%`).
226    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
227        let start_pos = state.get_position();
228
229        let mut has_digits = false;
230        if let Some(ch) = state.peek() {
231            if ch == '+' || ch == '-' {
232                state.advance(1)
233            }
234        }
235
236        while let Some(ch) = state.peek() {
237            if ch.is_ascii_digit() {
238                state.advance(1);
239                has_digits = true
240            }
241            else {
242                break;
243            }
244        }
245
246        if let Some('.') = state.peek() {
247            if let Some(next_ch) = state.peek_next_n(1) {
248                if next_ch.is_ascii_digit() {
249                    state.advance(1); // Skip .
250                    while let Some(ch) = state.peek() {
251                        if ch.is_ascii_digit() {
252                            state.advance(1);
253                            has_digits = true
254                        }
255                        else {
256                            break;
257                        }
258                    }
259                }
260            }
261        }
262
263        if has_digits {
264            // Check for units
265            let unit_start = state.get_position();
266            while let Some(ch) = state.peek() {
267                if ch.is_alphabetic() || ch == '%' { state.advance(ch.len_utf8()) } else { break }
268            }
269
270            if state.get_position() > unit_start {
271                // We have a number with a unit
272                state.add_token(CssTokenType::NumberLiteral, start_pos, state.get_position())
273            }
274            else {
275                state.add_token(CssTokenType::NumberLiteral, start_pos, state.get_position())
276            }
277            true
278        }
279        else {
280            state.set_position(start_pos);
281            false
282        }
283    }
284
285    /// Handles identifiers (e.g., property names, selectors).
286    fn lex_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
287        let start_pos = state.get_position();
288
289        if let Some(ch) = state.peek() {
290            if ch.is_alphabetic() || ch == '_' || ch == '-' {
291                while let Some(ch) = state.peek() {
292                    if ch.is_alphanumeric() || ch == '_' || ch == '-' { state.advance(ch.len_utf8()) } else { break }
293                }
294
295                state.add_token(CssTokenType::Identifier, start_pos, state.get_position());
296                true
297            }
298            else {
299                false
300            }
301        }
302        else {
303            false
304        }
305    }
306
307    /// Handles CSS at-rules (e.g., `@import`, `@media`).
308    fn lex_at_rule<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
309        let start_pos = state.get_position();
310
311        if let Some('@') = state.peek() {
312            state.advance(1); // Skip @
313
314            let rule_start = state.get_position();
315            while let Some(ch) = state.peek() {
316                if ch.is_alphabetic() || ch == '-' { state.advance(ch.len_utf8()) } else { break }
317            }
318
319            let rule_name = state.get_text_in((rule_start..state.get_position()).into());
320            let token_kind = match rule_name.as_ref() {
321                "import" => CssTokenType::AtImport,
322                "media" => CssTokenType::AtMedia,
323                "keyframes" => CssTokenType::AtKeyframes,
324                "font-face" => CssTokenType::AtFontFace,
325                "charset" => CssTokenType::AtCharset,
326                "namespace" => CssTokenType::AtNamespace,
327                "supports" => CssTokenType::AtSupports,
328                "page" => CssTokenType::AtPage,
329                "document" => CssTokenType::AtDocument,
330                _ => CssTokenType::AtRule,
331            };
332
333            state.add_token(token_kind, start_pos, state.get_position());
334            true
335        }
336        else {
337            false
338        }
339    }
340
341    /// Handles delimiters (e.g., `(`, `)`, `{`, `}`, `,`, `;`).
342    fn lex_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
343        let start_pos = state.get_position();
344
345        if let Some(ch) = state.peek() {
346            let token_kind = match ch {
347                '(' => CssTokenType::LeftParen,
348                ')' => CssTokenType::RightParen,
349                '{' => CssTokenType::LeftBrace,
350                '}' => CssTokenType::RightBrace,
351                '[' => CssTokenType::LeftBracket,
352                ']' => CssTokenType::RightBracket,
353                ',' => CssTokenType::Comma,
354                ';' => CssTokenType::Semicolon,
355                _ => return false,
356            };
357
358            state.advance(1);
359            state.add_token(token_kind, start_pos, state.get_position());
360            true
361        }
362        else {
363            false
364        }
365    }
366
367    /// Handles operators (e.g., `:`, `.`, `>`, `+`, `~`, `*`, `/`).
368    fn lex_operator<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
369        let start_pos = state.get_position();
370
371        if let Some(ch) = state.peek() {
372            let token_kind = match ch {
373                ':' => CssTokenType::Colon,
374                '.' => CssTokenType::Dot,
375                '#' => CssTokenType::Hash,
376                '+' => CssTokenType::Plus,
377                '-' => CssTokenType::Minus,
378                '*' => CssTokenType::Star,
379                '/' => CssTokenType::Slash,
380                '=' => CssTokenType::Equals,
381                '~' => CssTokenType::Tilde,
382                '|' => CssTokenType::Pipe,
383                '^' => CssTokenType::Caret,
384                '$' => CssTokenType::Dollar,
385                '>' => CssTokenType::GreaterThan,
386                _ => return false,
387            };
388
389            state.advance(1);
390            state.add_token(token_kind, start_pos, state.get_position());
391            true
392        }
393        else {
394            false
395        }
396    }
397
398    /// Main entry point for the lexer.
399    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
400        while state.not_at_end() {
401            let safe_point = state.get_position();
402
403            // Try various lexing rules
404            if self.skip_whitespace(state) {
405                continue;
406            }
407
408            if self.lex_newline(state) {
409                continue;
410            }
411
412            if self.lex_comment(state) {
413                continue;
414            }
415
416            if self.lex_string(state) {
417                continue;
418            }
419
420            if self.lex_url(state) {
421                continue;
422            }
423
424            if self.lex_color(state) {
425                continue;
426            }
427
428            if self.lex_number(state) {
429                continue;
430            }
431
432            if self.lex_at_rule(state) {
433                continue;
434            }
435
436            if self.lex_identifier(state) {
437                continue;
438            }
439
440            if self.lex_delimiter(state) {
441                continue;
442            }
443
444            if self.lex_operator(state) {
445                continue;
446            }
447
448            // If no rules match, skip the current character and mark as error
449            let start_pos = state.get_position();
450            if let Some(ch) = state.peek() {
451                state.advance(ch.len_utf8());
452                state.add_token(CssTokenType::Error, start_pos, state.get_position())
453            }
454            else {
455                break;
456            }
457
458            state.advance_if_dead_lock(safe_point)
459        }
460        Ok(())
461    }
462}
463
impl<'config> Lexer<CssLanguage> for CssLexer<'config> {
    /// Tokenizes the source code into a stream of CSS tokens.
    ///
    /// `_edits` is ignored: this lexer always performs a full re-lex rather
    /// than an incremental one. An EOF token is appended only when `run`
    /// succeeded; the result (ok or error) is forwarded to the cache.
    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::source::TextEdit], mut cache: &'a mut impl oak_core::lexer::LexerCache<CssLanguage>) -> LexOutput<CssLanguage> {
        let mut state = LexerState::new(source);
        let result = self.run(&mut state);
        if result.is_ok() {
            state.add_eof()
        }
        // NOTE(review): `&mut cache` takes a reference to the `&mut impl
        // LexerCache` binding itself (`&mut &mut _`); presumably
        // `finish_with_cache` accepts this via auto-reborrow — confirm its
        // signature, as the extra `mut` on the parameter looks redundant.
        state.finish_with_cache(result, &mut cache)
    }
}
474}