1use crate::css_parser::{
2 source::SourceInput,
3 token::{Token, TokenKind},
4};
5
6pub struct Tokenizer<'a> {
8 input: SourceInput<'a>,
9 finished: bool,
10}
11
12impl<'a> Tokenizer<'a> {
13 pub fn new(input: &'a str) -> Self {
14 Self {
15 input: SourceInput::new(input),
16 finished: false,
17 }
18 }
19
20 pub fn next_token(&mut self) -> Token<'a> {
21 self.consume_comments();
22
23 let loc = self.input.location();
24
25 match self.input.next_char() {
26 None => Token {
27 kind: TokenKind::Eof,
28 loc,
29 },
30
31 Some(ch) => {
32 let kind = match ch {
33 ' ' | '\t' | '\n' => {
34 self.consume_while(|c| c == ' ' || c == '\t' || c == '\n');
35 TokenKind::Whitespace
36 }
37
38 '"' => self.consume_string('"'),
39 '\'' => self.consume_string('\''),
40
41 '#' => {
42 let next = self.input.current_char();
43 let next2 = self.input.peek_char(1);
44 if is_name_char(next) || is_valid_escape(next, next2) {
45 let is_id = would_start_ident(
46 self.input.current_char(),
47 self.input.peek_char(1),
48 self.input.peek_char(2),
49 );
50 let start = self.input.pos();
51 self.consume_name_chars();
52 let value = self.input.slice(start, self.input.pos());
53 TokenKind::Hash { value, is_id }
54 } else {
55 TokenKind::Delim('#')
56 }
57 }
58
59 '(' => TokenKind::OpenParen,
60 ')' => TokenKind::CloseParen,
61
62 '+' => {
63 if would_start_number(
64 Some('+'),
65 self.input.current_char(),
66 self.input.peek_char(1),
67 ) {
68 self.input.reconsume();
69 self.consume_numeric()
70 } else {
71 TokenKind::Delim('+')
72 }
73 }
74
75 ',' => TokenKind::Comma,
76
77 '-' => {
78 if would_start_number(
79 Some('-'),
80 self.input.current_char(),
81 self.input.peek_char(1),
82 ) {
83 self.input.reconsume();
84 self.consume_numeric()
85 } else if self.input.current_char() == Some('-')
86 && self.input.peek_char(1) == Some('>')
87 {
88 self.input.next_char();
89 self.input.next_char();
90 TokenKind::Cdc
91 } else if would_start_ident(
92 Some('-'),
93 self.input.current_char(),
94 self.input.peek_char(1),
95 ) {
96 self.input.reconsume();
97 self.consume_ident_like()
98 } else {
99 TokenKind::Delim('-')
100 }
101 }
102
103 '.' => {
104 if would_start_number(
105 Some('.'),
106 self.input.current_char(),
107 self.input.peek_char(1),
108 ) {
109 self.input.reconsume();
110 self.consume_numeric()
111 } else {
112 TokenKind::Delim('.')
113 }
114 }
115
116 ':' => TokenKind::Colon,
117 ';' => TokenKind::Semicolon,
118
119 '<' => {
120 if self.input.current_char() == Some('!')
121 && self.input.peek_char(1) == Some('-')
122 && self.input.peek_char(2) == Some('-')
123 {
124 self.input.next_char();
125 self.input.next_char();
126 self.input.next_char();
127 TokenKind::Cdo
128 } else {
129 TokenKind::Delim('<')
130 }
131 }
132
133 '@' => {
134 if would_start_ident(
135 self.input.current_char(),
136 self.input.peek_char(1),
137 self.input.peek_char(2),
138 ) {
139 let start = self.input.pos();
140 self.consume_name_chars();
141 let name = self.input.slice(start, self.input.pos());
142 TokenKind::AtKeyword(name)
143 } else {
144 TokenKind::Delim('@')
145 }
146 }
147
148 '[' => TokenKind::OpenSquare,
149 ']' => TokenKind::CloseSquare,
150
151 '\\' => {
152 if is_valid_escape(Some('\\'), self.input.current_char()) {
153 self.input.reconsume();
154 self.consume_ident_like()
155 } else {
156 TokenKind::Delim('\\')
157 }
158 }
159
160 '{' => TokenKind::OpenCurly,
161 '}' => TokenKind::CloseCurly,
162
163 '0'..='9' => {
164 self.input.reconsume();
165 self.consume_numeric()
166 }
167
168 c if is_name_start(c) => {
169 self.input.reconsume();
170 self.consume_ident_like()
171 }
172
173 other => TokenKind::Delim(other),
174 };
175
176 Token { kind, loc }
177 }
178 }
179 }
180
181 fn consume_comments(&mut self) {
182 loop {
183 if self.input.current_char() == Some('/') && self.input.peek_char(1) == Some('*') {
184 self.input.next_char();
185 self.input.next_char();
186 loop {
187 match self.input.next_char() {
188 None => return,
189 Some('*') if self.input.current_char() == Some('/') => {
190 self.input.next_char();
191 break;
192 }
193 _ => {}
194 }
195 }
196 } else {
197 return;
198 }
199 }
200 }
201
202 fn consume_string(&mut self, ending: char) -> TokenKind<'a> {
203 let start = self.input.pos();
204 loop {
205 match self.input.next_char() {
206 None => {
207 let value = self.input.slice(start, self.input.pos());
208 return TokenKind::String(value);
209 }
210 Some(c) if c == ending => {
211 let value = self.input.slice(start, self.input.pos() - 1);
212 return TokenKind::String(value);
213 }
214 Some('\n') => {
215 self.input.reconsume();
216 return TokenKind::BadString;
217 }
218 Some('\\') => match self.input.current_char() {
219 None => {}
220 Some('\n') => {
221 self.input.next_char();
222 }
223 _ => {
224 self.consume_escape();
225 }
226 },
227 Some(_) => {}
228 }
229 }
230 }
231
232 fn consume_numeric(&mut self) -> TokenKind<'a> {
233 let (value, int_value, has_sign) = self.consume_number();
234
235 if would_start_ident(
236 self.input.current_char(),
237 self.input.peek_char(1),
238 self.input.peek_char(2),
239 ) {
240 let start = self.input.pos();
241 self.consume_name_chars();
242 let unit = self.input.slice(start, self.input.pos());
243 return TokenKind::Dimension {
244 value,
245 int_value,
246 unit,
247 };
248 }
249
250 if self.input.current_char() == Some('%') {
251 self.input.next_char();
252 return TokenKind::Percentage { value, int_value };
253 }
254
255 TokenKind::Number {
256 value,
257 int_value,
258 has_sign,
259 }
260 }
261
262 fn consume_number(&mut self) -> (f64, Option<i64>, bool) {
263 let start = self.input.pos();
264 let mut is_integer = true;
265 let mut has_sign = false;
266
267 match self.input.current_char() {
268 Some('+') | Some('-') => {
269 has_sign = true;
270 self.input.next_char();
271 }
272 _ => {}
273 }
274
275 self.consume_while(|c| c.is_ascii_digit());
276
277 if self.input.current_char() == Some('.')
278 && self.input.peek_char(1).is_some_and(|c| c.is_ascii_digit())
279 {
280 is_integer = false;
281 self.input.next_char();
282 self.consume_while(|c| c.is_ascii_digit());
283 }
284
285 if matches!(self.input.current_char(), Some('e') | Some('E')) {
286 let next = self.input.peek_char(1);
287 if next.is_some_and(|c| c.is_ascii_digit())
288 || (matches!(next, Some('+') | Some('-'))
289 && self.input.peek_char(2).is_some_and(|c| c.is_ascii_digit()))
290 {
291 is_integer = false;
292 self.input.next_char();
293 if matches!(self.input.current_char(), Some('+') | Some('-')) {
294 self.input.next_char();
295 }
296 self.consume_while(|c| c.is_ascii_digit());
297 }
298 }
299
300 let repr = self.input.slice(start, self.input.pos());
301 let value: f64 = repr.parse().unwrap_or(0.0);
302 let int_value = if is_integer {
303 repr.parse::<i64>().ok()
304 } else {
305 None
306 };
307
308 (value, int_value, has_sign)
309 }
310
311 fn consume_ident_like(&mut self) -> TokenKind<'a> {
312 let start = self.input.pos();
313 self.consume_name_chars();
314 let name = self.input.slice(start, self.input.pos());
315
316 if name.eq_ignore_ascii_case("url") && self.input.current_char() == Some('(') {
317 self.input.next_char();
318 self.consume_while(|c| c == ' ' || c == '\t' || c == '\n');
319 match self.input.current_char() {
320 Some('"') | Some('\'') => {
321 return TokenKind::Function(name);
322 }
323 _ => {
324 return self.consume_url();
325 }
326 }
327 }
328
329 if self.input.current_char() == Some('(') {
330 self.input.next_char();
331 return TokenKind::Function(name);
332 }
333
334 TokenKind::Ident(name)
335 }
336
337 fn consume_url(&mut self) -> TokenKind<'a> {
338 self.consume_while(|c| c == ' ' || c == '\t' || c == '\n');
339 let start = self.input.pos();
340
341 loop {
342 match self.input.next_char() {
343 None => {
344 let value = self.input.slice(start, self.input.pos());
345 return TokenKind::Url(value.trim_end());
346 }
347 Some(')') => {
348 let end = self.input.pos() - 1;
349 let value = self.input.slice(start, end).trim_end();
350 return TokenKind::Url(value);
351 }
352 Some(' ') | Some('\t') | Some('\n') => {
353 let end = self.input.pos() - 1;
354 self.consume_while(|c| c == ' ' || c == '\t' || c == '\n');
355 if self.input.current_char() == Some(')') || self.input.is_eof() {
356 self.input.next_char();
357 let value = self.input.slice(start, end);
358 return TokenKind::Url(value);
359 }
360 self.consume_bad_url_remnants();
361 return TokenKind::BadUrl;
362 }
363 Some('"') | Some('\'') | Some('(') => {
364 self.consume_bad_url_remnants();
365 return TokenKind::BadUrl;
366 }
367 Some('\\') => {
368 if is_valid_escape(Some('\\'), self.input.current_char()) {
369 self.consume_escape();
370 } else {
371 self.consume_bad_url_remnants();
372 return TokenKind::BadUrl;
373 }
374 }
375 Some(c) if is_non_printable(c) => {
376 self.consume_bad_url_remnants();
377 return TokenKind::BadUrl;
378 }
379 Some(_) => {}
380 }
381 }
382 }
383
384 fn consume_bad_url_remnants(&mut self) {
385 loop {
386 match self.input.next_char() {
387 None | Some(')') => return,
388 Some('\\') if is_valid_escape(Some('\\'), self.input.current_char()) => {
389 self.consume_escape();
390 }
391 _ => {}
392 }
393 }
394 }
395
396 fn consume_escape(&mut self) -> char {
397 match self.input.next_char() {
398 None => '\u{FFFD}',
399 Some(c) if c.is_ascii_hexdigit() => {
400 let mut hex = String::with_capacity(6);
401 hex.push(c);
402 for _ in 0..5 {
403 match self.input.current_char() {
404 Some(h) if h.is_ascii_hexdigit() => {
405 hex.push(h);
406 self.input.next_char();
407 }
408 _ => break,
409 }
410 }
411 if matches!(
412 self.input.current_char(),
413 Some(' ') | Some('\t') | Some('\n')
414 ) {
415 self.input.next_char();
416 }
417 u32::from_str_radix(&hex, 16)
418 .ok()
419 .and_then(char::from_u32)
420 .map(|c| if c == '\0' { '\u{FFFD}' } else { c })
421 .unwrap_or('\u{FFFD}')
422 }
423 Some(c) => c,
424 }
425 }
426
427 fn consume_name_chars(&mut self) {
428 loop {
429 match self.input.current_char() {
430 Some(c) if is_name_char(Some(c)) => {
431 self.input.next_char();
432 }
433 Some('\\') if is_valid_escape(Some('\\'), self.input.peek_char(1)) => {
434 self.input.next_char();
435 self.consume_escape();
436 }
437 _ => return,
438 }
439 }
440 }
441
442 fn consume_while(&mut self, predicate: impl Fn(char) -> bool) {
443 while let Some(c) = self.input.current_char() {
444 if predicate(c) {
445 self.input.next_char();
446 } else {
447 break;
448 }
449 }
450 }
451}
452
453impl<'a> Iterator for Tokenizer<'a> {
454 type Item = Token<'a>;
455
456 fn next(&mut self) -> Option<Token<'a>> {
457 if self.finished {
458 return None;
459 }
460 let token = self.next_token();
461 if token.kind == TokenKind::Eof {
462 self.finished = true;
463 return None;
464 }
465 Some(token)
466 }
467}
468
/// Returns true when `c` may begin a name: an ASCII letter, an underscore,
/// or any non-ASCII character.
fn is_name_start(c: char) -> bool {
    c == '_' || c.is_ascii_alphabetic() || u32::from(c) >= 0x80
}
472
473fn is_name_char(c: Option<char>) -> bool {
474 match c {
475 Some(c) => is_name_start(c) || c.is_ascii_digit() || c == '-',
476 None => false,
477 }
478}
479
/// Returns true for the control characters treated as non-printable:
/// U+0000 through U+0008, U+000B, U+000E through U+001F, and U+007F.
fn is_non_printable(c: char) -> bool {
    let code = u32::from(c);
    code <= 0x08 || code == 0x0B || (0x0E..=0x1F).contains(&code) || code == 0x7F
}
483
/// Returns true when `first` is a backslash and `second` is not a newline.
///
/// A backslash followed by end of input (`second` is `None`) counts as valid.
fn is_valid_escape(first: Option<char>, second: Option<char>) -> bool {
    matches!(first, Some('\\')) && !matches!(second, Some('\n'))
}
487
488fn would_start_ident(first: Option<char>, second: Option<char>, third: Option<char>) -> bool {
489 match first {
490 Some('-') => {
491 matches!(second, Some(c) if is_name_start(c) || c == '-')
492 || is_valid_escape(second, third)
493 }
494 Some(c) if is_name_start(c) => true,
495 Some('\\') => is_valid_escape(first, second),
496 _ => false,
497 }
498}
499
/// Returns true when the three given characters would begin a number.
///
/// * Leading sign: a digit must follow, or a `.` followed by a digit.
/// * Leading `.`: a digit must follow.
/// * Otherwise the first character must itself be an ASCII digit.
fn would_start_number(first: Option<char>, second: Option<char>, third: Option<char>) -> bool {
    let is_digit = |c: Option<char>| c.is_some_and(|d| d.is_ascii_digit());

    match (first, second) {
        (Some('+' | '-'), Some('.')) => is_digit(third),
        (Some('+' | '-' | '.'), _) => is_digit(second),
        _ => is_digit(first),
    }
}
512
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the tokenizer over `input` and keeps only the token kinds.
    fn tokenize(input: &str) -> Vec<TokenKind<'_>> {
        let mut kinds = Vec::new();
        for token in Tokenizer::new(input) {
            kinds.push(token.kind);
        }
        kinds
    }

    #[test]
    fn simple_ident() {
        let expected = vec![TokenKind::Ident("color")];
        assert_eq!(tokenize("color"), expected);
    }

    #[test]
    fn function_token() {
        let tokens = tokenize("rgb(255)");
        assert_eq!(tokens[0], TokenKind::Function("rgb"));
        assert!(matches!(&tokens[1], TokenKind::Number { value, .. } if *value == 255.0));
        assert_eq!(tokens[2], TokenKind::CloseParen);
    }

    #[test]
    fn at_keyword() {
        let expected = vec![TokenKind::AtKeyword("media")];
        assert_eq!(tokenize("@media"), expected);
    }

    #[test]
    fn hash_id() {
        let expected = vec![TokenKind::Hash {
            value: "foo",
            is_id: true,
        }];
        assert_eq!(tokenize("#foo"), expected);
    }

    #[test]
    fn string_double_quotes() {
        let expected = vec![TokenKind::String("hello world")];
        assert_eq!(tokenize("\"hello world\""), expected);
    }

    #[test]
    fn number_integer() {
        let expected = vec![TokenKind::Number {
            value: 42.0,
            int_value: Some(42),
            has_sign: false,
        }];
        assert_eq!(tokenize("42"), expected);
    }

    #[test]
    fn percentage() {
        let expected = vec![TokenKind::Percentage {
            value: 50.0,
            int_value: Some(50),
        }];
        assert_eq!(tokenize("50%"), expected);
    }

    #[test]
    fn dimension() {
        let expected = vec![TokenKind::Dimension {
            value: 10.0,
            int_value: Some(10),
            unit: "px",
        }];
        assert_eq!(tokenize("10px"), expected);
    }

    #[test]
    fn full_rule() {
        let expected = vec![
            TokenKind::Ident("h1"),
            TokenKind::Whitespace,
            TokenKind::OpenCurly,
            TokenKind::Whitespace,
            TokenKind::Ident("color"),
            TokenKind::Colon,
            TokenKind::Whitespace,
            TokenKind::Ident("red"),
            TokenKind::Semicolon,
            TokenKind::Whitespace,
            TokenKind::CloseCurly,
        ];
        assert_eq!(tokenize("h1 { color: red; }"), expected);
    }

    #[test]
    fn comment_skipped() {
        // The comment vanishes, but the whitespace on each side of it is
        // still reported as a separate token.
        let expected = vec![
            TokenKind::Ident("a"),
            TokenKind::Whitespace,
            TokenKind::Whitespace,
            TokenKind::Ident("b"),
        ];
        assert_eq!(tokenize("a /* comment */ b"), expected);
    }

    #[test]
    fn url_token() {
        let expected = vec![TokenKind::Url("image.png")];
        assert_eq!(tokenize("url(image.png)"), expected);
    }

    #[test]
    fn negative_dimension() {
        let expected = vec![TokenKind::Dimension {
            value: -10.0,
            int_value: Some(-10),
            unit: "px",
        }];
        assert_eq!(tokenize("-10px"), expected);
    }
}