use crate::css_parser::{
source::SourceInput,
token::{Token, TokenKind},
};
/// Tokenizer over a borrowed CSS source string.
///
/// Tokens are produced one at a time via `next_token`, or by iterating over the tokenizer
/// itself (iteration stops at end of input instead of yielding an EOF token).
pub struct Tokenizer<'a> {
/// Cursor over the source text; every character read and every slice taken goes through it.
input: SourceInput<'a>,
/// Set by the `Iterator` impl once the EOF token has been seen, so later calls return `None`.
finished: bool,
}
impl<'a> Tokenizer<'a> {
/// Creates a tokenizer over `input` that has not yet produced any token.
pub fn new(input: &'a str) -> Self {
    let input = SourceInput::new(input);
    Self {
        input,
        finished: false,
    }
}
/// Reads and returns the next token from the input.
///
/// Leading `/* ... */` comments are skipped before the token's location is recorded. When the
/// input is exhausted the returned token has kind [`TokenKind::Eof`].
pub fn next_token(&mut self) -> Token<'a> {
    self.consume_comments();
    let loc = self.input.location();
    let Some(ch) = self.input.next_char() else {
        return Token {
            kind: TokenKind::Eof,
            loc,
        };
    };
    // `ch` has already been read, so `current_char()` / `peek_char(n)` below inspect the
    // characters that follow it. Branches that hand off to a `consume_*` helper needing the
    // whole token call `reconsume()` first (presumably stepping back over `ch`).
    let kind = match ch {
        ' ' | '\t' | '\n' => {
            self.consume_while(|c| matches!(c, ' ' | '\t' | '\n'));
            TokenKind::Whitespace
        }
        '"' | '\'' => self.consume_string(ch),
        '#' => {
            let first = self.input.current_char();
            let second = self.input.peek_char(1);
            if is_name_char(first) || is_valid_escape(first, second) {
                // The hash is flagged as an ID when what follows would start an identifier.
                let is_id = would_start_ident(first, second, self.input.peek_char(2));
                let start = self.input.pos();
                self.consume_name_chars();
                let value = self.input.slice(start, self.input.pos());
                TokenKind::Hash { value, is_id }
            } else {
                TokenKind::Delim('#')
            }
        }
        '(' => TokenKind::OpenParen,
        ')' => TokenKind::CloseParen,
        ',' => TokenKind::Comma,
        ':' => TokenKind::Colon,
        ';' => TokenKind::Semicolon,
        '[' => TokenKind::OpenSquare,
        ']' => TokenKind::CloseSquare,
        '{' => TokenKind::OpenCurly,
        '}' => TokenKind::CloseCurly,
        // `+` and `.` are either the start of a number or a plain delimiter.
        '+' | '.' => {
            let first = self.input.current_char();
            let second = self.input.peek_char(1);
            if would_start_number(Some(ch), first, second) {
                self.input.reconsume();
                self.consume_numeric()
            } else {
                TokenKind::Delim(ch)
            }
        }
        // `-` may start a number, the `-->` marker, an identifier, or be a delimiter;
        // the checks run in that order.
        '-' => {
            let first = self.input.current_char();
            let second = self.input.peek_char(1);
            if would_start_number(Some('-'), first, second) {
                self.input.reconsume();
                self.consume_numeric()
            } else if first == Some('-') && second == Some('>') {
                self.input.next_char();
                self.input.next_char();
                TokenKind::Cdc
            } else if would_start_ident(Some('-'), first, second) {
                self.input.reconsume();
                self.consume_ident_like()
            } else {
                TokenKind::Delim('-')
            }
        }
        '<' => {
            let opens_cdo = self.input.current_char() == Some('!')
                && self.input.peek_char(1) == Some('-')
                && self.input.peek_char(2) == Some('-');
            if opens_cdo {
                // Swallow the remaining `!--` of `<!--`.
                self.input.next_char();
                self.input.next_char();
                self.input.next_char();
                TokenKind::Cdo
            } else {
                TokenKind::Delim('<')
            }
        }
        '@' => {
            let names_follow = would_start_ident(
                self.input.current_char(),
                self.input.peek_char(1),
                self.input.peek_char(2),
            );
            if names_follow {
                let start = self.input.pos();
                self.consume_name_chars();
                TokenKind::AtKeyword(self.input.slice(start, self.input.pos()))
            } else {
                TokenKind::Delim('@')
            }
        }
        '\\' => {
            if is_valid_escape(Some('\\'), self.input.current_char()) {
                self.input.reconsume();
                self.consume_ident_like()
            } else {
                TokenKind::Delim('\\')
            }
        }
        '0'..='9' => {
            self.input.reconsume();
            self.consume_numeric()
        }
        c if is_name_start(c) => {
            self.input.reconsume();
            self.consume_ident_like()
        }
        other => TokenKind::Delim(other),
    };
    Token { kind, loc }
}
/// Skips any run of back-to-back `/* ... */` comments at the current position.
///
/// An unterminated comment swallows the rest of the input.
fn consume_comments(&mut self) {
    while self.input.current_char() == Some('/') && self.input.peek_char(1) == Some('*') {
        // Step over the opening `/*`.
        self.input.next_char();
        self.input.next_char();
        loop {
            let Some(ch) = self.input.next_char() else {
                // Reached end of input before the comment was closed.
                return;
            };
            if ch == '*' && self.input.current_char() == Some('/') {
                // Step over the `/` that closes the comment.
                self.input.next_char();
                break;
            }
        }
    }
}
/// Consumes a quoted string whose opening quote has already been read.
///
/// `ending` is the quote character that terminates the string. The returned
/// [`TokenKind::String`] holds the raw source slice between the quotes (escapes are skipped
/// over, not decoded). End of input yields the string read so far; an unescaped newline
/// yields [`TokenKind::BadString`] and leaves the newline unread.
fn consume_string(&mut self, ending: char) -> TokenKind<'a> {
    let start = self.input.pos();
    loop {
        let Some(ch) = self.input.next_char() else {
            return TokenKind::String(self.input.slice(start, self.input.pos()));
        };
        if ch == ending {
            // Exclude the closing quote that was just read.
            let end = self.input.pos() - 1;
            return TokenKind::String(self.input.slice(start, end));
        }
        match ch {
            '\n' => {
                self.input.reconsume();
                return TokenKind::BadString;
            }
            '\\' => match self.input.current_char() {
                // A lone backslash at end of input: nothing to escape.
                None => {}
                // Backslash-newline is a line continuation; skip the newline.
                Some('\n') => {
                    self.input.next_char();
                }
                Some(_) => {
                    self.consume_escape();
                }
            },
            _ => {}
        }
    }
}
/// Consumes a number and classifies it by what follows.
///
/// Returns a `Dimension` when an identifier (the unit) follows, a `Percentage` when `%`
/// follows, and a plain `Number` otherwise.
fn consume_numeric(&mut self) -> TokenKind<'a> {
    let (value, int_value, has_sign) = self.consume_number();
    let unit_follows = would_start_ident(
        self.input.current_char(),
        self.input.peek_char(1),
        self.input.peek_char(2),
    );
    if unit_follows {
        let unit_start = self.input.pos();
        self.consume_name_chars();
        let unit = self.input.slice(unit_start, self.input.pos());
        TokenKind::Dimension {
            value,
            int_value,
            unit,
        }
    } else if self.input.current_char() == Some('%') {
        self.input.next_char();
        TokenKind::Percentage { value, int_value }
    } else {
        TokenKind::Number {
            value,
            int_value,
            has_sign,
        }
    }
}
/// Consumes the textual form of a number: optional sign, digits, optional fraction, and
/// optional exponent.
///
/// Returns `(value, int_value, has_sign)`:
/// * `value` is the text parsed as `f64`, or `0.0` if parsing fails;
/// * `int_value` is `Some` only when there was no fraction or exponent and the text
///   parses as an `i64`;
/// * `has_sign` records whether an explicit `+` or `-` was present.
fn consume_number(&mut self) -> (f64, Option<i64>, bool) {
    let start = self.input.pos();

    let has_sign = matches!(self.input.current_char(), Some('+' | '-'));
    if has_sign {
        self.input.next_char();
    }
    self.consume_while(|c| c.is_ascii_digit());

    let mut is_integer = true;

    // Fraction: a `.` only belongs to the number when a digit follows it.
    let has_fraction = self.input.current_char() == Some('.')
        && self.input.peek_char(1).is_some_and(|c| c.is_ascii_digit());
    if has_fraction {
        is_integer = false;
        self.input.next_char();
        self.consume_while(|c| c.is_ascii_digit());
    }

    // Exponent: `e`/`E`, an optional sign, then at least one digit.
    if matches!(self.input.current_char(), Some('e' | 'E')) {
        let has_exponent = match self.input.peek_char(1) {
            Some(c) if c.is_ascii_digit() => true,
            Some('+' | '-') => self.input.peek_char(2).is_some_and(|c| c.is_ascii_digit()),
            _ => false,
        };
        if has_exponent {
            is_integer = false;
            self.input.next_char();
            if matches!(self.input.current_char(), Some('+' | '-')) {
                self.input.next_char();
            }
            self.consume_while(|c| c.is_ascii_digit());
        }
    }

    let repr = self.input.slice(start, self.input.pos());
    let value = repr.parse::<f64>().unwrap_or(0.0);
    let int_value = repr.parse::<i64>().ok().filter(|_| is_integer);
    (value, int_value, has_sign)
}
/// Consumes a name and decides whether it is an identifier, a function, or a URL.
///
/// * name not followed by `(` gives `Ident`;
/// * name followed by `(` gives `Function` (the `(` is consumed);
/// * `url(` (case-insensitive) followed by optional whitespace and a quote gives
///   `Function`, leaving the quoted string for the next token;
/// * any other `url(` is handed to `consume_url`.
fn consume_ident_like(&mut self) -> TokenKind<'a> {
    let start = self.input.pos();
    self.consume_name_chars();
    let name = self.input.slice(start, self.input.pos());

    if self.input.current_char() != Some('(') {
        return TokenKind::Ident(name);
    }
    self.input.next_char();

    if !name.eq_ignore_ascii_case("url") {
        return TokenKind::Function(name);
    }

    // `url(`: skip whitespace, then look at whether the argument is quoted.
    self.consume_while(|c| matches!(c, ' ' | '\t' | '\n'));
    if matches!(self.input.current_char(), Some('"' | '\'')) {
        TokenKind::Function(name)
    } else {
        self.consume_url()
    }
}
/// Consumes the contents of an unquoted `url(...)` up to and including the closing `)`.
///
/// Returns `Url` with the raw source slice (surrounding whitespace excluded) on success or
/// at end of input. Returns `BadUrl` after skipping the rest of the URL when it hits a
/// quote, `(`, a non-printable character, an invalid escape, or whitespace that is not
/// followed by `)` or end of input.
fn consume_url(&mut self) -> TokenKind<'a> {
    self.consume_while(|c| matches!(c, ' ' | '\t' | '\n'));
    let start = self.input.pos();
    loop {
        let Some(ch) = self.input.next_char() else {
            let value = self.input.slice(start, self.input.pos());
            return TokenKind::Url(value.trim_end());
        };
        match ch {
            ')' => {
                // Exclude the `)` that was just read.
                let end = self.input.pos() - 1;
                return TokenKind::Url(self.input.slice(start, end).trim_end());
            }
            ' ' | '\t' | '\n' => {
                // The URL text stops before this first whitespace character.
                let end = self.input.pos() - 1;
                self.consume_while(|c| matches!(c, ' ' | '\t' | '\n'));
                if self.input.current_char() == Some(')') || self.input.is_eof() {
                    self.input.next_char();
                    return TokenKind::Url(self.input.slice(start, end));
                }
                break;
            }
            '"' | '\'' | '(' => break,
            '\\' => {
                if !is_valid_escape(Some('\\'), self.input.current_char()) {
                    break;
                }
                self.consume_escape();
            }
            c if is_non_printable(c) => break,
            _ => {}
        }
    }
    // Every `break` above marks a malformed URL: discard the rest of it.
    self.consume_bad_url_remnants();
    TokenKind::BadUrl
}
/// Discards input up to and including the next unescaped `)`, or to end of input.
///
/// Valid escapes are consumed as a unit so that an escaped `)` does not end the scan.
fn consume_bad_url_remnants(&mut self) {
    while let Some(ch) = self.input.next_char() {
        match ch {
            ')' => break,
            '\\' if is_valid_escape(Some('\\'), self.input.current_char()) => {
                self.consume_escape();
            }
            _ => {}
        }
    }
}
/// Consumes an escape sequence whose backslash has already been read and returns the
/// character it denotes.
///
/// * End of input gives U+FFFD.
/// * One to six hex digits, optionally followed by a single whitespace character (which is
///   also consumed), give the code point they spell, or U+FFFD when that is zero or not a
///   valid `char`.
/// * Any other character stands for itself.
fn consume_escape(&mut self) -> char {
    const REPLACEMENT: char = '\u{FFFD}';

    let Some(first) = self.input.next_char() else {
        return REPLACEMENT;
    };
    let Some(mut code) = first.to_digit(16) else {
        return first;
    };

    // Up to five more hex digits; six digits top out at 0xFFFFFF, so `u32` cannot overflow.
    for _ in 0..5 {
        let Some(digit) = self.input.current_char().and_then(|h| h.to_digit(16)) else {
            break;
        };
        code = code * 16 + digit;
        self.input.next_char();
    }

    // A single whitespace character after the digits is part of the escape.
    if matches!(self.input.current_char(), Some(' ' | '\t' | '\n')) {
        self.input.next_char();
    }

    char::from_u32(code)
        .filter(|&decoded| decoded != '\0')
        .unwrap_or(REPLACEMENT)
}
/// Advances past a run of name characters and valid escape sequences.
///
/// Nothing is returned; callers record the position before and after and slice the input.
fn consume_name_chars(&mut self) {
    while let Some(ch) = self.input.current_char() {
        if is_name_char(Some(ch)) {
            self.input.next_char();
        } else if ch == '\\' && is_valid_escape(Some(ch), self.input.peek_char(1)) {
            // Step over the backslash, then over whatever it escapes.
            self.input.next_char();
            self.consume_escape();
        } else {
            break;
        }
    }
}
/// Advances the input for as long as the current character satisfies `predicate`.
///
/// Stops at the first character that fails the predicate, or at end of input.
fn consume_while(&mut self, predicate: impl Fn(char) -> bool) {
    while self.input.current_char().is_some_and(&predicate) {
        self.input.next_char();
    }
}
}
/// Iterates over the tokens of the input, ending at end of input.
///
/// The EOF token itself is never yielded; once it has been seen every further call
/// returns `None` without touching the input again.
impl<'a> Iterator for Tokenizer<'a> {
    type Item = Token<'a>;

    fn next(&mut self) -> Option<Token<'a>> {
        if self.finished {
            return None;
        }
        let token = self.next_token();
        let at_end = token.kind == TokenKind::Eof;
        self.finished = at_end;
        if at_end {
            None
        } else {
            Some(token)
        }
    }
}
/// Returns `true` if `c` may begin a name: an ASCII letter, `_`, or any non-ASCII character.
fn is_name_start(c: char) -> bool {
    matches!(c, 'a'..='z' | 'A'..='Z' | '_') || c > '\x7F'
}
/// Returns `true` if `c` is present and may appear inside a name: a name-start character,
/// an ASCII digit, or `-`. `None` (end of input) is never a name character.
fn is_name_char(c: Option<char>) -> bool {
    c.is_some_and(|ch| is_name_start(ch) || ch.is_ascii_digit() || ch == '-')
}
/// Returns `true` for the ASCII control characters other than tab, line feed, form feed,
/// and carriage return (that is U+0000..=U+0008, U+000B, U+000E..=U+001F, and U+007F).
fn is_non_printable(c: char) -> bool {
    c.is_ascii_control() && !matches!(c, '\t' | '\n' | '\x0C' | '\r')
}
/// Returns `true` if `first` and `second` form the start of a valid escape: `first` is a
/// backslash and `second` is anything other than a newline (end of input included).
fn is_valid_escape(first: Option<char>, second: Option<char>) -> bool {
    match (first, second) {
        (Some('\\'), Some('\n')) => false,
        (Some('\\'), _) => true,
        _ => false,
    }
}
/// Returns `true` if the three given characters would begin an identifier.
///
/// * `-` starts one when followed by a name-start character, another `-`, or a valid escape;
/// * `\` starts one when it begins a valid escape;
/// * any other character starts one exactly when it is a name-start character.
fn would_start_ident(first: Option<char>, second: Option<char>, third: Option<char>) -> bool {
    let Some(lead) = first else {
        return false;
    };
    match lead {
        '-' => {
            second.is_some_and(|c| is_name_start(c) || c == '-')
                || is_valid_escape(second, third)
        }
        '\\' => is_valid_escape(first, second),
        other => is_name_start(other),
    }
}
/// Returns `true` if the three given characters would begin a number.
///
/// Accepted openings are: a digit; `.` followed by a digit; or a sign (`+`/`-`) followed by
/// either a digit or `.` and a digit.
fn would_start_number(first: Option<char>, second: Option<char>, third: Option<char>) -> bool {
    let is_digit = |c: Option<char>| c.is_some_and(|ch| ch.is_ascii_digit());
    match (first, second) {
        (Some('+' | '-'), Some('.')) => is_digit(third),
        (Some('+' | '-' | '.'), _) => is_digit(second),
        _ => is_digit(first),
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenizes `source` via the iterator and keeps only each token's kind.
    fn kinds(source: &str) -> Vec<TokenKind<'_>> {
        Tokenizer::new(source).map(|token| token.kind).collect()
    }

    #[test]
    fn simple_ident() {
        let expected = vec![TokenKind::Ident("color")];
        assert_eq!(kinds("color"), expected);
    }

    #[test]
    fn function_token() {
        let produced = kinds("rgb(255)");
        assert_eq!(produced.first(), Some(&TokenKind::Function("rgb")));
        assert!(matches!(
            produced.get(1),
            Some(TokenKind::Number { value, .. }) if *value == 255.0
        ));
        assert_eq!(produced.get(2), Some(&TokenKind::CloseParen));
    }

    #[test]
    fn at_keyword() {
        let expected = vec![TokenKind::AtKeyword("media")];
        assert_eq!(kinds("@media"), expected);
    }

    #[test]
    fn hash_id() {
        let expected = vec![TokenKind::Hash {
            value: "foo",
            is_id: true,
        }];
        assert_eq!(kinds("#foo"), expected);
    }

    #[test]
    fn string_double_quotes() {
        let expected = vec![TokenKind::String("hello world")];
        assert_eq!(kinds("\"hello world\""), expected);
    }

    #[test]
    fn number_integer() {
        let expected = vec![TokenKind::Number {
            value: 42.0,
            int_value: Some(42),
            has_sign: false,
        }];
        assert_eq!(kinds("42"), expected);
    }

    #[test]
    fn percentage() {
        let expected = vec![TokenKind::Percentage {
            value: 50.0,
            int_value: Some(50),
        }];
        assert_eq!(kinds("50%"), expected);
    }

    #[test]
    fn dimension() {
        let expected = vec![TokenKind::Dimension {
            value: 10.0,
            int_value: Some(10),
            unit: "px",
        }];
        assert_eq!(kinds("10px"), expected);
    }

    #[test]
    fn full_rule() {
        let expected = vec![
            TokenKind::Ident("h1"),
            TokenKind::Whitespace,
            TokenKind::OpenCurly,
            TokenKind::Whitespace,
            TokenKind::Ident("color"),
            TokenKind::Colon,
            TokenKind::Whitespace,
            TokenKind::Ident("red"),
            TokenKind::Semicolon,
            TokenKind::Whitespace,
            TokenKind::CloseCurly,
        ];
        assert_eq!(kinds("h1 { color: red; }"), expected);
    }

    #[test]
    fn comment_skipped() {
        // The comment vanishes, but the whitespace on either side of it yields two
        // separate whitespace tokens.
        let expected = vec![
            TokenKind::Ident("a"),
            TokenKind::Whitespace,
            TokenKind::Whitespace,
            TokenKind::Ident("b"),
        ];
        assert_eq!(kinds("a /* comment */ b"), expected);
    }

    #[test]
    fn url_token() {
        let expected = vec![TokenKind::Url("image.png")];
        assert_eq!(kinds("url(image.png)"), expected);
    }

    #[test]
    fn negative_dimension() {
        let expected = vec![TokenKind::Dimension {
            value: -10.0,
            int_value: Some(-10),
            unit: "px",
        }];
        assert_eq!(kinds("-10px"), expected);
    }
}