Skip to main content

hpx_browser/css_parser/
tokenizer.rs

1use crate::css_parser::{
2    source::SourceInput,
3    token::{Token, TokenKind},
4};
5
6/// CSS tokenizer per CSS Syntax Level 3 ยง4.
7pub struct Tokenizer<'a> {
8    input: SourceInput<'a>,
9    finished: bool,
10}
11
12impl<'a> Tokenizer<'a> {
13    pub fn new(input: &'a str) -> Self {
14        Self {
15            input: SourceInput::new(input),
16            finished: false,
17        }
18    }
19
20    pub fn next_token(&mut self) -> Token<'a> {
21        self.consume_comments();
22
23        let loc = self.input.location();
24
25        match self.input.next_char() {
26            None => Token {
27                kind: TokenKind::Eof,
28                loc,
29            },
30
31            Some(ch) => {
32                let kind = match ch {
33                    ' ' | '\t' | '\n' => {
34                        self.consume_while(|c| c == ' ' || c == '\t' || c == '\n');
35                        TokenKind::Whitespace
36                    }
37
38                    '"' => self.consume_string('"'),
39                    '\'' => self.consume_string('\''),
40
41                    '#' => {
42                        let next = self.input.current_char();
43                        let next2 = self.input.peek_char(1);
44                        if is_name_char(next) || is_valid_escape(next, next2) {
45                            let is_id = would_start_ident(
46                                self.input.current_char(),
47                                self.input.peek_char(1),
48                                self.input.peek_char(2),
49                            );
50                            let start = self.input.pos();
51                            self.consume_name_chars();
52                            let value = self.input.slice(start, self.input.pos());
53                            TokenKind::Hash { value, is_id }
54                        } else {
55                            TokenKind::Delim('#')
56                        }
57                    }
58
59                    '(' => TokenKind::OpenParen,
60                    ')' => TokenKind::CloseParen,
61
62                    '+' => {
63                        if would_start_number(
64                            Some('+'),
65                            self.input.current_char(),
66                            self.input.peek_char(1),
67                        ) {
68                            self.input.reconsume();
69                            self.consume_numeric()
70                        } else {
71                            TokenKind::Delim('+')
72                        }
73                    }
74
75                    ',' => TokenKind::Comma,
76
77                    '-' => {
78                        if would_start_number(
79                            Some('-'),
80                            self.input.current_char(),
81                            self.input.peek_char(1),
82                        ) {
83                            self.input.reconsume();
84                            self.consume_numeric()
85                        } else if self.input.current_char() == Some('-')
86                            && self.input.peek_char(1) == Some('>')
87                        {
88                            self.input.next_char();
89                            self.input.next_char();
90                            TokenKind::Cdc
91                        } else if would_start_ident(
92                            Some('-'),
93                            self.input.current_char(),
94                            self.input.peek_char(1),
95                        ) {
96                            self.input.reconsume();
97                            self.consume_ident_like()
98                        } else {
99                            TokenKind::Delim('-')
100                        }
101                    }
102
103                    '.' => {
104                        if would_start_number(
105                            Some('.'),
106                            self.input.current_char(),
107                            self.input.peek_char(1),
108                        ) {
109                            self.input.reconsume();
110                            self.consume_numeric()
111                        } else {
112                            TokenKind::Delim('.')
113                        }
114                    }
115
116                    ':' => TokenKind::Colon,
117                    ';' => TokenKind::Semicolon,
118
119                    '<' => {
120                        if self.input.current_char() == Some('!')
121                            && self.input.peek_char(1) == Some('-')
122                            && self.input.peek_char(2) == Some('-')
123                        {
124                            self.input.next_char();
125                            self.input.next_char();
126                            self.input.next_char();
127                            TokenKind::Cdo
128                        } else {
129                            TokenKind::Delim('<')
130                        }
131                    }
132
133                    '@' => {
134                        if would_start_ident(
135                            self.input.current_char(),
136                            self.input.peek_char(1),
137                            self.input.peek_char(2),
138                        ) {
139                            let start = self.input.pos();
140                            self.consume_name_chars();
141                            let name = self.input.slice(start, self.input.pos());
142                            TokenKind::AtKeyword(name)
143                        } else {
144                            TokenKind::Delim('@')
145                        }
146                    }
147
148                    '[' => TokenKind::OpenSquare,
149                    ']' => TokenKind::CloseSquare,
150
151                    '\\' => {
152                        if is_valid_escape(Some('\\'), self.input.current_char()) {
153                            self.input.reconsume();
154                            self.consume_ident_like()
155                        } else {
156                            TokenKind::Delim('\\')
157                        }
158                    }
159
160                    '{' => TokenKind::OpenCurly,
161                    '}' => TokenKind::CloseCurly,
162
163                    '0'..='9' => {
164                        self.input.reconsume();
165                        self.consume_numeric()
166                    }
167
168                    c if is_name_start(c) => {
169                        self.input.reconsume();
170                        self.consume_ident_like()
171                    }
172
173                    other => TokenKind::Delim(other),
174                };
175
176                Token { kind, loc }
177            }
178        }
179    }
180
181    fn consume_comments(&mut self) {
182        loop {
183            if self.input.current_char() == Some('/') && self.input.peek_char(1) == Some('*') {
184                self.input.next_char();
185                self.input.next_char();
186                loop {
187                    match self.input.next_char() {
188                        None => return,
189                        Some('*') if self.input.current_char() == Some('/') => {
190                            self.input.next_char();
191                            break;
192                        }
193                        _ => {}
194                    }
195                }
196            } else {
197                return;
198            }
199        }
200    }
201
202    fn consume_string(&mut self, ending: char) -> TokenKind<'a> {
203        let start = self.input.pos();
204        loop {
205            match self.input.next_char() {
206                None => {
207                    let value = self.input.slice(start, self.input.pos());
208                    return TokenKind::String(value);
209                }
210                Some(c) if c == ending => {
211                    let value = self.input.slice(start, self.input.pos() - 1);
212                    return TokenKind::String(value);
213                }
214                Some('\n') => {
215                    self.input.reconsume();
216                    return TokenKind::BadString;
217                }
218                Some('\\') => match self.input.current_char() {
219                    None => {}
220                    Some('\n') => {
221                        self.input.next_char();
222                    }
223                    _ => {
224                        self.consume_escape();
225                    }
226                },
227                Some(_) => {}
228            }
229        }
230    }
231
232    fn consume_numeric(&mut self) -> TokenKind<'a> {
233        let (value, int_value, has_sign) = self.consume_number();
234
235        if would_start_ident(
236            self.input.current_char(),
237            self.input.peek_char(1),
238            self.input.peek_char(2),
239        ) {
240            let start = self.input.pos();
241            self.consume_name_chars();
242            let unit = self.input.slice(start, self.input.pos());
243            return TokenKind::Dimension {
244                value,
245                int_value,
246                unit,
247            };
248        }
249
250        if self.input.current_char() == Some('%') {
251            self.input.next_char();
252            return TokenKind::Percentage { value, int_value };
253        }
254
255        TokenKind::Number {
256            value,
257            int_value,
258            has_sign,
259        }
260    }
261
262    fn consume_number(&mut self) -> (f64, Option<i64>, bool) {
263        let start = self.input.pos();
264        let mut is_integer = true;
265        let mut has_sign = false;
266
267        match self.input.current_char() {
268            Some('+') | Some('-') => {
269                has_sign = true;
270                self.input.next_char();
271            }
272            _ => {}
273        }
274
275        self.consume_while(|c| c.is_ascii_digit());
276
277        if self.input.current_char() == Some('.')
278            && self.input.peek_char(1).is_some_and(|c| c.is_ascii_digit())
279        {
280            is_integer = false;
281            self.input.next_char();
282            self.consume_while(|c| c.is_ascii_digit());
283        }
284
285        if matches!(self.input.current_char(), Some('e') | Some('E')) {
286            let next = self.input.peek_char(1);
287            if next.is_some_and(|c| c.is_ascii_digit())
288                || (matches!(next, Some('+') | Some('-'))
289                    && self.input.peek_char(2).is_some_and(|c| c.is_ascii_digit()))
290            {
291                is_integer = false;
292                self.input.next_char();
293                if matches!(self.input.current_char(), Some('+') | Some('-')) {
294                    self.input.next_char();
295                }
296                self.consume_while(|c| c.is_ascii_digit());
297            }
298        }
299
300        let repr = self.input.slice(start, self.input.pos());
301        let value: f64 = repr.parse().unwrap_or(0.0);
302        let int_value = if is_integer {
303            repr.parse::<i64>().ok()
304        } else {
305            None
306        };
307
308        (value, int_value, has_sign)
309    }
310
311    fn consume_ident_like(&mut self) -> TokenKind<'a> {
312        let start = self.input.pos();
313        self.consume_name_chars();
314        let name = self.input.slice(start, self.input.pos());
315
316        if name.eq_ignore_ascii_case("url") && self.input.current_char() == Some('(') {
317            self.input.next_char();
318            self.consume_while(|c| c == ' ' || c == '\t' || c == '\n');
319            match self.input.current_char() {
320                Some('"') | Some('\'') => {
321                    return TokenKind::Function(name);
322                }
323                _ => {
324                    return self.consume_url();
325                }
326            }
327        }
328
329        if self.input.current_char() == Some('(') {
330            self.input.next_char();
331            return TokenKind::Function(name);
332        }
333
334        TokenKind::Ident(name)
335    }
336
337    fn consume_url(&mut self) -> TokenKind<'a> {
338        self.consume_while(|c| c == ' ' || c == '\t' || c == '\n');
339        let start = self.input.pos();
340
341        loop {
342            match self.input.next_char() {
343                None => {
344                    let value = self.input.slice(start, self.input.pos());
345                    return TokenKind::Url(value.trim_end());
346                }
347                Some(')') => {
348                    let end = self.input.pos() - 1;
349                    let value = self.input.slice(start, end).trim_end();
350                    return TokenKind::Url(value);
351                }
352                Some(' ') | Some('\t') | Some('\n') => {
353                    let end = self.input.pos() - 1;
354                    self.consume_while(|c| c == ' ' || c == '\t' || c == '\n');
355                    if self.input.current_char() == Some(')') || self.input.is_eof() {
356                        self.input.next_char();
357                        let value = self.input.slice(start, end);
358                        return TokenKind::Url(value);
359                    }
360                    self.consume_bad_url_remnants();
361                    return TokenKind::BadUrl;
362                }
363                Some('"') | Some('\'') | Some('(') => {
364                    self.consume_bad_url_remnants();
365                    return TokenKind::BadUrl;
366                }
367                Some('\\') => {
368                    if is_valid_escape(Some('\\'), self.input.current_char()) {
369                        self.consume_escape();
370                    } else {
371                        self.consume_bad_url_remnants();
372                        return TokenKind::BadUrl;
373                    }
374                }
375                Some(c) if is_non_printable(c) => {
376                    self.consume_bad_url_remnants();
377                    return TokenKind::BadUrl;
378                }
379                Some(_) => {}
380            }
381        }
382    }
383
384    fn consume_bad_url_remnants(&mut self) {
385        loop {
386            match self.input.next_char() {
387                None | Some(')') => return,
388                Some('\\') if is_valid_escape(Some('\\'), self.input.current_char()) => {
389                    self.consume_escape();
390                }
391                _ => {}
392            }
393        }
394    }
395
396    fn consume_escape(&mut self) -> char {
397        match self.input.next_char() {
398            None => '\u{FFFD}',
399            Some(c) if c.is_ascii_hexdigit() => {
400                let mut hex = String::with_capacity(6);
401                hex.push(c);
402                for _ in 0..5 {
403                    match self.input.current_char() {
404                        Some(h) if h.is_ascii_hexdigit() => {
405                            hex.push(h);
406                            self.input.next_char();
407                        }
408                        _ => break,
409                    }
410                }
411                if matches!(
412                    self.input.current_char(),
413                    Some(' ') | Some('\t') | Some('\n')
414                ) {
415                    self.input.next_char();
416                }
417                u32::from_str_radix(&hex, 16)
418                    .ok()
419                    .and_then(char::from_u32)
420                    .map(|c| if c == '\0' { '\u{FFFD}' } else { c })
421                    .unwrap_or('\u{FFFD}')
422            }
423            Some(c) => c,
424        }
425    }
426
427    fn consume_name_chars(&mut self) {
428        loop {
429            match self.input.current_char() {
430                Some(c) if is_name_char(Some(c)) => {
431                    self.input.next_char();
432                }
433                Some('\\') if is_valid_escape(Some('\\'), self.input.peek_char(1)) => {
434                    self.input.next_char();
435                    self.consume_escape();
436                }
437                _ => return,
438            }
439        }
440    }
441
442    fn consume_while(&mut self, predicate: impl Fn(char) -> bool) {
443        while let Some(c) = self.input.current_char() {
444            if predicate(c) {
445                self.input.next_char();
446            } else {
447                break;
448            }
449        }
450    }
451}
452
453impl<'a> Iterator for Tokenizer<'a> {
454    type Item = Token<'a>;
455
456    fn next(&mut self) -> Option<Token<'a>> {
457        if self.finished {
458            return None;
459        }
460        let token = self.next_token();
461        if token.kind == TokenKind::Eof {
462            self.finished = true;
463            return None;
464        }
465        Some(token)
466    }
467}
468
/// Returns `true` if `c` may begin a name: an ASCII letter, an underscore,
/// or any non-ASCII character.
fn is_name_start(c: char) -> bool {
    matches!(c, 'a'..='z' | 'A'..='Z' | '_') || u32::from(c) >= 0x80
}
472
473fn is_name_char(c: Option<char>) -> bool {
474    match c {
475        Some(c) => is_name_start(c) || c.is_ascii_digit() || c == '-',
476        None => false,
477    }
478}
479
/// Returns `true` for the control characters that are not allowed inside an
/// unquoted URL: U+0000..=U+0008, U+000B, U+000E..=U+001F and U+007F.
fn is_non_printable(c: char) -> bool {
    let code = u32::from(c);
    code <= 0x08 || code == 0x0B || (0x0E..=0x1F).contains(&code) || code == 0x7F
}
483
/// Returns `true` if `first` and `second` form a valid escape: a backslash
/// that is not followed by a line feed. A backslash followed by end of
/// input (`None`) counts as valid.
fn is_valid_escape(first: Option<char>, second: Option<char>) -> bool {
    match (first, second) {
        (Some('\\'), Some('\n')) => false,
        (Some('\\'), _) => true,
        _ => false,
    }
}
487
488fn would_start_ident(first: Option<char>, second: Option<char>, third: Option<char>) -> bool {
489    match first {
490        Some('-') => {
491            matches!(second, Some(c) if is_name_start(c) || c == '-')
492                || is_valid_escape(second, third)
493        }
494        Some(c) if is_name_start(c) => true,
495        Some('\\') => is_valid_escape(first, second),
496        _ => false,
497    }
498}
499
/// Returns `true` if the three given characters would start a number.
///
/// * A sign must be followed by a digit, or by `.` and then a digit.
/// * A `.` must be followed by a digit.
/// * Otherwise the first character must itself be an ASCII digit.
fn would_start_number(first: Option<char>, second: Option<char>, third: Option<char>) -> bool {
    let is_digit = |c: Option<char>| c.is_some_and(|d| d.is_ascii_digit());

    match first {
        Some('+' | '-') => is_digit(second) || (second == Some('.') && is_digit(third)),
        Some('.') => is_digit(second),
        other => is_digit(other),
    }
}
512
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the tokenizer over `input` and collects only the token kinds
    /// (the end-of-file token is not included by the iterator).
    fn tokenize(input: &str) -> Vec<TokenKind<'_>> {
        let mut kinds = Vec::new();
        for token in Tokenizer::new(input) {
            kinds.push(token.kind);
        }
        kinds
    }

    #[test]
    fn simple_ident() {
        let expected = vec![TokenKind::Ident("color")];
        assert_eq!(tokenize("color"), expected);
    }

    #[test]
    fn function_token() {
        let kinds = tokenize("rgb(255)");
        assert_eq!(kinds[0], TokenKind::Function("rgb"));
        assert!(matches!(kinds[1], TokenKind::Number { value, .. } if value == 255.0));
        assert_eq!(kinds[2], TokenKind::CloseParen);
    }

    #[test]
    fn at_keyword() {
        let expected = vec![TokenKind::AtKeyword("media")];
        assert_eq!(tokenize("@media"), expected);
    }

    #[test]
    fn hash_id() {
        let expected = vec![TokenKind::Hash {
            value: "foo",
            is_id: true,
        }];
        assert_eq!(tokenize("#foo"), expected);
    }

    #[test]
    fn string_double_quotes() {
        let expected = vec![TokenKind::String("hello world")];
        assert_eq!(tokenize("\"hello world\""), expected);
    }

    #[test]
    fn number_integer() {
        let expected = vec![TokenKind::Number {
            value: 42.0,
            int_value: Some(42),
            has_sign: false,
        }];
        assert_eq!(tokenize("42"), expected);
    }

    #[test]
    fn percentage() {
        let expected = vec![TokenKind::Percentage {
            value: 50.0,
            int_value: Some(50),
        }];
        assert_eq!(tokenize("50%"), expected);
    }

    #[test]
    fn dimension() {
        let expected = vec![TokenKind::Dimension {
            value: 10.0,
            int_value: Some(10),
            unit: "px",
        }];
        assert_eq!(tokenize("10px"), expected);
    }

    #[test]
    fn full_rule() {
        let expected = vec![
            TokenKind::Ident("h1"),
            TokenKind::Whitespace,
            TokenKind::OpenCurly,
            TokenKind::Whitespace,
            TokenKind::Ident("color"),
            TokenKind::Colon,
            TokenKind::Whitespace,
            TokenKind::Ident("red"),
            TokenKind::Semicolon,
            TokenKind::Whitespace,
            TokenKind::CloseCurly,
        ];
        assert_eq!(tokenize("h1 { color: red; }"), expected);
    }

    #[test]
    fn comment_skipped() {
        // The whitespace on each side of the comment is tokenized separately.
        let expected = vec![
            TokenKind::Ident("a"),
            TokenKind::Whitespace,
            TokenKind::Whitespace,
            TokenKind::Ident("b"),
        ];
        assert_eq!(tokenize("a /* comment */ b"), expected);
    }

    #[test]
    fn url_token() {
        let expected = vec![TokenKind::Url("image.png")];
        assert_eq!(tokenize("url(image.png)"), expected);
    }

    #[test]
    fn negative_dimension() {
        let expected = vec![TokenKind::Dimension {
            value: -10.0,
            int_value: Some(-10),
            unit: "px",
        }];
        assert_eq!(tokenize("-10px"), expected);
    }
}