Skip to main content

azul_simplecss/
tokenizer.rs

1// This Source Code Form is subject to the terms of the Mozilla Public
2// License, v. 2.0. If a copy of the MPL was not distributed with this
3// file, You can obtain one at http://mozilla.org/MPL/2.0/.
4
5use crate::stream;
6use crate::stream::Stream;
7use crate::error::Error;
8
/// CSS combinator.
///
/// Describes how two neighbouring simple selectors relate to each other.
/// The enum carries no payload, so it is `Copy` and can be compared, hashed
/// and stored freely by consumers of the token stream.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
pub enum Combinator {
    /// Descendant selector (whitespace between two selectors).
    Space,
    /// Child selector (`>`).
    GreaterThan,
    /// Adjacent sibling selector (`+`).
    Plus,
    /// General sibling selector (`~`).
    Tilde,
}
21
22/// CSS token.
23#[derive(PartialEq,Debug)]
24pub enum Token<'a> {
25    /// Universal selector
26    ///
27    /// https://www.w3.org/TR/CSS21/selector.html#universal-selector
28    UniversalSelector,
29    /// Type selector
30    ///
31    /// https://www.w3.org/TR/CSS21/selector.html#type-selectors
32    TypeSelector(&'a str),
33    /// ID selector
34    ///
35    /// Value contains ident without `#`.
36    ///
37    /// https://www.w3.org/TR/CSS21/selector.html#id-selectors
38    IdSelector(&'a str),
39    /// Class selector
40    ///
41    /// Value contains ident without `.`.
42    ///
43    /// https://www.w3.org/TR/CSS21/selector.html#class-html
44    ClassSelector(&'a str),
45    /// Attribute selector
46    ///
47    /// We do not parse it's content yet, so value contains everything between `[]`.
48    ///
49    /// https://www.w3.org/TR/CSS21/selector.html#attribute-selectors
50    AttributeSelector(&'a str),
51    /// Pseudo-class
52    ///
53    /// Value contains ident without `:`.
54    /// Selector: `"nth-child"`, value: The thing between the braces - `Some("3")`
55    ///
56    /// https://www.w3.org/TR/CSS21/selector.html#pseudo-class-selectors
57    PseudoClass {
58        /// The selector name (e.g., "nth-child", "hover")
59        selector: &'a str,
60        /// The optional value inside parentheses (e.g., "3" in ":nth-child(3)")
61        value: Option<&'a str>,
62    },
63    /// `Combinator`
64    Combinator(Combinator),
65    /// Rules separator
66    ///
67    /// https://www.w3.org/TR/CSS21/selector.html#grouping
68    Comma,
69    /// Block start
70    ///
71    /// Indicates `{`.
72    ///
73    /// https://www.w3.org/TR/CSS21/syndata.html#rule-sets
74    BlockStart,
75    /// Block end
76    ///
77    /// Indicates `}`.
78    ///
79    /// https://www.w3.org/TR/CSS21/syndata.html#rule-sets
80    BlockEnd,
81    /// Declaration
82    ///
83    /// Contains property name and property value.
84    ///
85    /// https://www.w3.org/TR/CSS21/syndata.html#declaration
86    Declaration(&'a str, &'a str),
87    /// `@` rule (excluding the `@` sign itself). The content is not parsed,
88    /// for example `@keyframes mymove` = `AtRule("keyframes"), AtStr("mymove")`.
89    AtRule(&'a str),
90    /// Raw Str inside of block
91    DeclarationStr(&'a str),
92    /// String following an @rule
93    AtStr(&'a str),
94    /// Same as PseudoClass, but with two colons (`::thing`).
95    DoublePseudoClass {
96        /// The selector name (e.g., "before", "after")
97        selector: &'a str,
98        /// The optional value inside parentheses
99        value: Option<&'a str>,
100    },
101    /// End of stream
102    ///
103    /// Parsing is finished.
104    EndOfStream,
105}
106
/// Internal tokenizer mode.
#[derive(PartialEq)]
enum State {
    /// Outside of any `{ ... }` block.
    Rule,
    /// Inside a `{ ... }` block.
    Declaration,
    /// Inside a block that was opened while already in `Declaration`.
    DeclarationRule,
}
113
114/// CSS tokenizer.
115pub struct Tokenizer<'a> {
116    stream: Stream<'a>,
117    state: State,
118    after_selector: bool,
119    has_at_rule: bool,
120    at_start: bool,
121    /// Track nesting depth for nested @-rules support
122    /// Each entry is true if the block at that level was started by an @-rule
123    nesting_stack: Vec<bool>,
124}
125
126impl<'a> Tokenizer<'a> {
127    /// Constructs a new `Tokenizer`.
128    pub fn new(text: &str) -> Tokenizer<'_> {
129        Tokenizer {
130            stream: Stream::new(text.as_bytes()),
131            state: State::Rule,
132            after_selector: false,
133            has_at_rule: false,
134            at_start: true,
135            nesting_stack: Vec::new(),
136        }
137    }
138
139    /// Constructs a new bounded `Tokenizer`.
140    ///
141    /// It can be useful if CSS data is inside other data, like HTML.
142    /// Using this method you will get an absolute error positions and not relative,
143    /// like when using [`new()`].
144    ///
145    /// [`new()`]: #method.new
146    pub fn new_bound(text: &str, start: usize, end: usize) -> Tokenizer<'_> {
147        Tokenizer {
148            stream: Stream::new_bound(text.as_bytes(), start, end),
149            state: State::Rule,
150            after_selector: false,
151            has_at_rule: false,
152            at_start: true,
153            nesting_stack: Vec::new(),
154        }
155    }
156
157    /// Returns a current position in the text.
158    pub fn pos(&self) -> usize {
159        self.stream.pos()
160    }
161
162    /// Parses a next token.
163    pub fn parse_next(&mut self) -> Result<Token<'a>, Error> {
164        if self.at_start {
165            self.stream.skip_spaces();
166            self.at_start = false;
167        }
168
169        if self.stream.at_end() {
170            return Ok(Token::EndOfStream);
171        }
172
173        match self.state {
174            State::Rule         => self.consume_rule(),
175            State::Declaration  => self.consume_declaration(),
176            State::DeclarationRule => self.consume_declaration(),
177        }
178    }
179
180    fn consume_rule(&mut self) -> Result<Token<'a>, Error> {
181        match self.stream.curr_char_raw() {
182            b'@' => {
183                self.after_selector = true;
184                self.has_at_rule = true;
185                self.stream.advance_raw(1);
186                let s = self.consume_ident()?;
187                
188                // Don't consume parentheses here - let the next parse_next() call handle it
189                // Just return the @rule name
190                return Ok(Token::AtRule(s));
191            }
192            b'#' => {
193                self.after_selector = true;
194                self.has_at_rule = false;
195                self.stream.advance_raw(1);
196                let s = self.consume_ident()?;
197                return Ok(Token::IdSelector(s));
198            }
199            b'.' => {
200                self.after_selector = true;
201                self.has_at_rule = false;
202                self.stream.advance_raw(1);
203                let s = self.consume_ident()?;
204                return Ok(Token::ClassSelector(s));
205            }
206            b'*' => {
207                self.after_selector = true;
208                self.has_at_rule = false;
209                self.stream.advance_raw(1);
210                self.stream.skip_spaces();
211                return Ok(Token::UniversalSelector);
212            }
213            b':' => {
214                self.after_selector = true;
215                self.has_at_rule = false;
216                self.stream.advance_raw(1);
217
218                // Whether this selector is a ::selector.
219                let is_double_colon = self.stream.is_char_eq(b':')?;
220                if is_double_colon {
221                    self.stream.advance_raw(1); // consume the second :
222                }
223
224                let s = self.consume_ident()?;
225
226                if self.stream.curr_char() == Ok(b'(') {
227                    // Item is a thing()
228                    self.stream.advance_raw(1); // (
229                    let inner_len = self.stream.length_to(b')')?;
230                    let inner = self.stream.read_raw_str(inner_len);
231                    self.stream.advance_raw(1); // )
232                    return Ok(if is_double_colon {
233                        Token::DoublePseudoClass { selector: s, value: Some(inner) }
234                    } else {
235                        Token::PseudoClass { selector: s, value: Some(inner) }
236                    });
237                } else {
238                    return Ok(if is_double_colon {
239                        Token::DoublePseudoClass { selector: s, value: None }
240                    } else {
241                        Token::PseudoClass { selector: s, value: None }
242                    });
243                }
244            }
245            b'[' => {
246                self.after_selector = true;
247                self.has_at_rule = false;
248                self.stream.advance_raw(1);
249                let len = self.stream.length_to(b']')?;
250                let s = self.stream.read_raw_str(len);
251                self.stream.advance_raw(1); // ]
252                self.stream.skip_spaces();
253                return Ok(Token::AttributeSelector(s));
254            }
255            b',' => {
256                self.after_selector = false;
257                self.has_at_rule = false;
258                self.stream.advance_raw(1);
259                self.stream.skip_spaces();
260                return Ok(Token::Comma);
261            }
262            b'{' => {
263                // Track if this block was started by an @-rule
264                self.nesting_stack.push(self.has_at_rule);
265                self.after_selector = false;
266                self.has_at_rule = false;
267                self.state = State::Declaration;
268                self.stream.advance_raw(1);
269                return Ok(Token::BlockStart);
270            }
271            b'>' => {
272                if self.after_selector {
273                    self.after_selector = false;
274                    self.has_at_rule = false;
275                    self.stream.advance_raw(1);
276                    self.stream.skip_spaces();
277                    return Ok(Token::Combinator(Combinator::GreaterThan));
278                } else {
279                    return Err(Error::UnknownToken(self.stream.gen_error_pos()));
280                }
281            }
282            b'+' => {
283                if self.after_selector {
284                    self.after_selector = false;
285                    self.has_at_rule = false;
286                    self.stream.advance_raw(1);
287                    self.stream.skip_spaces();
288                    return Ok(Token::Combinator(Combinator::Plus));
289                } else {
290                    return Err(Error::UnknownToken(self.stream.gen_error_pos()));
291                }
292            }
293            b'~' => {
294                if self.after_selector {
295                    self.after_selector = false;
296                    self.has_at_rule = false;
297                    self.stream.advance_raw(1);
298                    self.stream.skip_spaces();
299                    return Ok(Token::Combinator(Combinator::Tilde));
300                } else {
301                    return Err(Error::UnknownToken(self.stream.gen_error_pos()));
302                }
303            }
304            b'/' => {
305                if self.consume_comment()? {
306                    return self.parse_next();
307                } else {
308                    return Err(Error::UnknownToken(self.stream.gen_error_pos()));
309                }
310            }
311            b'(' if self.has_at_rule => {
312                // Parenthesized content in @-rule like @media (min-width: 800px)
313                let s = self.consume_parenthesized_content()?;
314                self.after_selector = true;
315                return Ok(Token::AtStr(s));
316            }
317            _ => {
318                if self.stream.is_space_raw() {
319                    self.stream.skip_spaces();
320
321                    if !self.after_selector {
322                        return self.parse_next();
323                    }
324
325                    match self.stream.curr_char()? {
326                        b'{' | b'/' | b'>' | b'+' | b'~' | b'*' | b'(' => { return self.parse_next(); },
327                        _ => {
328                            self.after_selector = false;
329                            if !self.has_at_rule {
330                                return Ok(Token::Combinator(Combinator::Space));
331                            }
332                        }
333                    }
334                }
335
336                let s = self.consume_ident()?;
337                let token_type = if self.has_at_rule {
338                    self.has_at_rule = true;
339                    Token::AtStr(s)
340                } else {
341                    self.has_at_rule = false;
342                    Token::TypeSelector(s)
343                };
344
345                self.after_selector = true;
346                return Ok(token_type);
347            }
348        }
349    }
350
351    fn consume_declaration(&mut self) -> Result<Token<'a>, Error> {
352        self.stream.skip_spaces();
353
354        match self.stream.curr_char_raw() {
355            b'}' => {
356                // Pop nesting level
357                self.nesting_stack.pop();
358                
359                if self.state == State::DeclarationRule {
360                    self.state = State::Declaration;
361                } else if self.state == State::Declaration {
362                    // Check if we should go back to Declaration or Rule based on nesting
363                    if self.nesting_stack.is_empty() {
364                        self.state = State::Rule;
365                    } else {
366                        // Stay in declaration mode for nested @-rules
367                        self.state = State::Declaration;
368                    }
369                }
370                self.has_at_rule = false;
371                self.stream.advance_raw(1);
372                self.stream.skip_spaces();
373                return Ok(Token::BlockEnd);
374            },
375            b'{' => {
376                // Track if this block was started by an @-rule
377                self.nesting_stack.push(self.has_at_rule);
378                self.has_at_rule = false;
379                
380                if self.state == State::Rule {
381                    self.state = State::Declaration;
382                } else if self.state == State::Declaration {
383                    self.state = State::DeclarationRule;
384                }
385                self.stream.advance_raw(1);
386                self.stream.skip_spaces();
387                return Ok(Token::BlockStart);
388            },
389            b'@' => {
390                // Nested @-rule inside a block (e.g., @media inside @os, or @os inside .class)
391                self.after_selector = true;
392                self.has_at_rule = true;
393                self.stream.advance_raw(1);
394                let s = self.consume_ident()?;
395                self.stream.skip_spaces();
396                return Ok(Token::AtRule(s));
397            },
398            b':' => {
399                // Nested pseudo-class selector (e.g., :hover { } inside .button { })
400                self.after_selector = true;
401                self.has_at_rule = false;
402                self.stream.advance_raw(1);
403
404                // Check for ::pseudo-element
405                let is_double_colon = self.stream.is_char_eq(b':')?;
406                if is_double_colon {
407                    self.stream.advance_raw(1);
408                }
409
410                let s = self.consume_ident()?;
411
412                if self.stream.curr_char() == Ok(b'(') {
413                    self.stream.advance_raw(1); // (
414                    let inner_len = self.stream.length_to(b')')?;
415                    let inner = self.stream.read_raw_str(inner_len);
416                    self.stream.advance_raw(1); // )
417                    return Ok(if is_double_colon {
418                        Token::DoublePseudoClass { selector: s, value: Some(inner) }
419                    } else {
420                        Token::PseudoClass { selector: s, value: Some(inner) }
421                    });
422                } else {
423                    return Ok(if is_double_colon {
424                        Token::DoublePseudoClass { selector: s, value: None }
425                    } else {
426                        Token::PseudoClass { selector: s, value: None }
427                    });
428                }
429            },
430            b'.' => {
431                // Nested class selector (e.g., .inner { } inside .outer { })
432                self.after_selector = true;
433                self.has_at_rule = false;
434                self.stream.advance_raw(1);
435                let s = self.consume_ident()?;
436                return Ok(Token::ClassSelector(s));
437            },
438            b'#' => {
439                // Nested ID selector (e.g., #inner { } inside .outer { })
440                self.after_selector = true;
441                self.has_at_rule = false;
442                self.stream.advance_raw(1);
443                let s = self.consume_ident()?;
444                return Ok(Token::IdSelector(s));
445            },
446            b'*' => {
447                // Nested universal selector
448                self.after_selector = true;
449                self.has_at_rule = false;
450                self.stream.advance_raw(1);
451                self.stream.skip_spaces();
452                return Ok(Token::UniversalSelector);
453            },
454            b'[' => {
455                // Nested attribute selector
456                self.after_selector = true;
457                self.has_at_rule = false;
458                self.stream.advance_raw(1);
459                let len = self.stream.length_to(b']')?;
460                let s = self.stream.read_raw_str(len);
461                self.stream.advance_raw(1); // ]
462                self.stream.skip_spaces();
463                return Ok(Token::AttributeSelector(s));
464            },
465            b'>' => {
466                // Direct child combinator in nested context
467                self.after_selector = false;
468                self.has_at_rule = false;
469                self.stream.advance_raw(1);
470                self.stream.skip_spaces();
471                return Ok(Token::Combinator(Combinator::GreaterThan));
472            },
473            b'+' => {
474                // Adjacent sibling combinator in nested context
475                self.after_selector = false;
476                self.has_at_rule = false;
477                self.stream.advance_raw(1);
478                self.stream.skip_spaces();
479                return Ok(Token::Combinator(Combinator::Plus));
480            },
481            b'~' => {
482                // General sibling combinator in nested context
483                self.after_selector = false;
484                self.has_at_rule = false;
485                self.stream.advance_raw(1);
486                self.stream.skip_spaces();
487                return Ok(Token::Combinator(Combinator::Tilde));
488            },
489            b',' => {
490                // Comma in nested context (multiple selectors)
491                self.after_selector = false;
492                self.has_at_rule = false;
493                self.stream.advance_raw(1);
494                self.stream.skip_spaces();
495                return Ok(Token::Comma);
496            },
497            b'(' if self.has_at_rule => {
498                // Parenthesized content in nested @-rule
499                let s = self.consume_parenthesized_content()?;
500                self.after_selector = true;
501                return Ok(Token::AtStr(s));
502            },
503            b'/' => {
504                if self.consume_comment()? {
505                    return self.parse_next();
506                } else {
507                    return Err(Error::UnknownToken(self.stream.gen_error_pos()));
508                }
509            }
510            _ => {
511                // Check for @-rule content (identifier after @rule like "@media screen")
512                if self.has_at_rule {
513                    let s = self.consume_ident()?;
514                    self.stream.skip_spaces();
515                    self.after_selector = true;
516                    return Ok(Token::AtStr(s));
517                }
518                
519                let name = self.consume_ident()?;
520
521                self.stream.skip_spaces();
522
523                if self.stream.is_char_eq(b'/')? {
524                    if !self.consume_comment()? {
525                        return Err(Error::UnknownToken(self.stream.gen_error_pos()));
526                    }
527                }
528
529                if self.stream.is_char_eq(b'{')? {
530                    // This is a nested type selector (e.g., "div { }" inside ".outer { }")
531                    if name.is_empty() {
532                        return Err(Error::UnknownToken(self.stream.gen_error_pos()));
533                    } else {
534                        self.after_selector = true;
535                        return Ok(Token::TypeSelector(name));
536                    }
537                }
538                
539                // Check for `:` to determine if this is a declaration
540                if !self.stream.is_char_eq(b':')? {
541                    // Not a declaration, might be a type selector in nested context
542                    self.after_selector = true;
543                    return Ok(Token::TypeSelector(name));
544                }
545
546                self.stream.advance_raw(1); // :
547                self.stream.skip_spaces();
548
549                if self.stream.is_char_eq(b'/')? {
550                    if !self.consume_comment()? {
551                        return Err(Error::UnknownToken(self.stream.gen_error_pos()));
552                    }
553                }
554
555                let len = self.stream.length_to_either(&[b';', b'}'])?;
556
557                if len == 0 {
558                    return Err(Error::UnknownToken(self.stream.gen_error_pos()));
559                }
560
561                let mut value = self.stream.read_raw_str(len);
562                // trim spaces at the end of the value
563                if let Some(p) = value.as_bytes().iter().rposition(|c| !stream::is_space(*c)) {
564                    value = &value[0..(p + 1)];
565                }
566
567                self.stream.skip_spaces();
568                while self.stream.is_char_eq(b';')? {
569                    self.stream.advance_raw(1);
570                    self.stream.skip_spaces();
571                }
572
573                Ok(Token::Declaration(name, value))
574            }
575        }
576    }
577
578    fn consume_ident(&mut self) -> Result<&'a str, Error> {
579        let start = self.stream.pos();
580
581        while !self.stream.at_end() {
582            if self.stream.is_ident_raw() {
583                self.stream.advance(1)?;
584            } else {
585                break;
586            }
587        }
588
589        if start == self.stream.pos() {
590            return Err(Error::UnknownToken(self.stream.gen_error_pos()));
591        }
592
593        let s = self.stream.slice_region_raw_str(start, self.stream.pos());
594        Ok(s)
595    }
596
597    fn consume_comment(&mut self) -> Result<bool, Error>  {
598        self.stream.advance_raw(1);
599
600        if self.stream.is_char_eq(b'*')? {
601            self.stream.advance_raw(1); // *
602
603            while !self.stream.at_end() {
604                let len = self.stream.length_to(b'*')?;
605                self.stream.advance(len + 1)?;
606                if self.stream.is_char_eq(b'/')? {
607                    self.stream.advance_raw(1);
608                    break;
609                }
610            }
611
612            return Ok(true);
613        } else {
614            return Ok(false);
615        }
616    }
617    
618    /// Consumes parenthesized content like "(min-width: 800px)" or "(linux)"
619    /// Handles nested parentheses correctly.
620    fn consume_parenthesized_content(&mut self) -> Result<&'a str, Error> {
621        if !self.stream.is_char_eq(b'(')? {
622            return Err(Error::UnknownToken(self.stream.gen_error_pos()));
623        }
624        
625        let start = self.stream.pos();
626        self.stream.advance_raw(1); // consume opening (
627        
628        let mut depth = 1;
629        
630        while !self.stream.at_end() && depth > 0 {
631            match self.stream.curr_char_raw() {
632                b'(' => {
633                    depth += 1;
634                    self.stream.advance_raw(1);
635                }
636                b')' => {
637                    depth -= 1;
638                    self.stream.advance_raw(1);
639                }
640                b'"' | b'\'' => {
641                    // Skip quoted strings
642                    let quote = self.stream.curr_char_raw();
643                    self.stream.advance_raw(1);
644                    while !self.stream.at_end() {
645                        let c = self.stream.curr_char_raw();
646                        self.stream.advance_raw(1);
647                        if c == quote {
648                            break;
649                        }
650                        if c == b'\\' && !self.stream.at_end() {
651                            self.stream.advance_raw(1); // skip escaped char
652                        }
653                    }
654                }
655                _ => {
656                    self.stream.advance_raw(1);
657                }
658            }
659        }
660        
661        if depth != 0 {
662            return Err(Error::UnknownToken(self.stream.gen_error_pos()));
663        }
664        
665        // Return content including the parentheses
666        let end = self.stream.pos();
667        let s = self.stream.slice_region_raw_str(start, end);
668        self.stream.skip_spaces();
669        Ok(s)
670    }
671}