use std::borrow::Cow;
use thiserror::Error;
/// A lexical token produced by the tokenizer.
///
/// Text-carrying variants hold a [`Cow`] bound to the input lifetime `'i`: the
/// tokenizer borrows straight from the input when it can and only allocates
/// when an escape sequence had to be decoded.
#[derive(Clone, Debug, PartialEq)]
pub enum Token<'i> {
    /// One or more consecutive whitespace characters.
    Whitespace,
    /// The `<!--` marker.
    Cdo,
    /// The `-->` marker.
    Cdc,
    /// `:`
    Colon,
    /// `;`
    Semicolon,
    /// `,`
    Comma,
    /// `[`
    LeftSquare,
    /// `]`
    RightSquare,
    /// `(`
    LeftParen,
    /// `)`
    RightParen,
    /// `{`
    LeftBrace,
    /// `}`
    RightBrace,
    /// A name that is not immediately followed by `(`.
    Ident(Cow<'i, str>),
    /// A name immediately followed by `(`; the parenthesis is consumed and not stored.
    Function(Cow<'i, str>),
    /// `@` followed by a name; the `@` is not stored.
    AtKeyword(Cow<'i, str>),
    /// `#` followed by name characters; the `#` is not stored.
    Hash {
        /// The text after `#`.
        value: Cow<'i, str>,
        /// `true` when the text after `#` would start an identifier.
        is_id: bool,
    },
    /// A quoted string, without its quotes and with escapes decoded.
    String(Cow<'i, str>),
    /// A quoted string that was cut short by a newline.
    BadString,
    /// The contents of a `url(...)` construct.
    Url(Cow<'i, str>),
    /// A malformed `url(...)` construct.
    BadUrl,
    /// A number with no suffix.
    Number(Number),
    /// A number immediately followed by `%`.
    Percentage(Number),
    /// A number immediately followed by a unit name.
    Dimension {
        /// The numeric part.
        value: Number,
        /// The unit name that followed the number.
        unit: Cow<'i, str>,
    },
    /// Any single character not covered by another variant.
    Delim(char),
    /// End of input.
    Eof,
}
/// Numeric payload shared by the number, percentage and dimension tokens.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct Number {
    /// The parsed value.
    pub value: f64,
    /// `true` when the literal had neither a fractional part nor an exponent.
    pub is_integer: bool,
}
/// A position inside the tokenizer input.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct SourceLocation {
    /// Offset from the beginning of the input.
    pub offset: usize,
    /// Line number; the first line is 1.
    pub line: u32,
    /// Column number; the first column is 1.
    pub column: u32,
}

impl SourceLocation {
    /// Returns the location of the very beginning of an input:
    /// offset 0, line 1, column 1.
    pub fn start() -> Self {
        SourceLocation {
            offset: 0,
            line: 1,
            column: 1,
        }
    }
}
/// Error type returned by [`tokenize`].
#[derive(Debug, Error, PartialEq)]
pub enum TokenizerError {
    /// Reports a NUL byte together with the offset it refers to.
    ///
    /// NOTE(review): no code visible in this file constructs this variant;
    /// confirm whether NUL detection is still planned.
    #[error("nul byte at offset {offset}")]
    NulByte {
        /// Offset reported in the error message.
        offset: usize,
    },
}
/// Tokenizes `input` into a list of `(token, location)` pairs.
///
/// Each location is the tokenizer position captured just before the token was
/// read. The list always ends with a single [`Token::Eof`] entry.
///
/// # Errors
///
/// NOTE(review): the signature allows a [`TokenizerError`], but this function
/// currently always returns `Ok`; confirm whether NUL bytes should be rejected.
pub fn tokenize(input: &str) -> Result<Vec<(Token<'_>, SourceLocation)>, TokenizerError> {
    let mut lexer = Tokenizer::new(input);
    let mut tokens = Vec::new();
    let mut reached_end = false;
    while !reached_end {
        // Capture the position first so it marks where the token begins.
        let at = lexer.location();
        let token = lexer.next_token();
        reached_end = matches!(token, Token::Eof);
        tokens.push((token, at));
    }
    Ok(tokens)
}
/// Cursor over the input text that also tracks line and column.
struct Tokenizer<'i> {
    /// The complete text being tokenized.
    input: &'i str,
    /// Byte index into `input` of the next unread character.
    pos: usize,
    /// Line of the next unread character (starts at 1).
    line: u32,
    /// Column of the next unread character (starts at 1).
    column: u32,
}
impl<'i> Tokenizer<'i> {
/// Creates a tokenizer positioned at the start of `input` (byte 0, line 1, column 1).
fn new(input: &'i str) -> Self {
    let (pos, line, column) = (0, 1, 1);
    Self {
        input,
        pos,
        line,
        column,
    }
}
fn location(&self) -> SourceLocation {
SourceLocation {
offset: self.pos,
line: self.line,
column: self.column,
}
}
/// Returns the next unread character without consuming it, or `None` at end of input.
fn peek(&self) -> Option<char> {
    let unread = &self.input[self.pos..];
    unread.chars().next()
}
/// Returns the character `n` positions ahead of the cursor without consuming
/// anything; `peek_nth(0)` is the same character as `peek()`.
fn peek_nth(&self, n: usize) -> Option<char> {
    let mut upcoming = self.input[self.pos..].chars();
    upcoming.nth(n)
}
/// Consumes and returns the next character, or returns `None` at end of input.
///
/// `pos` advances by the character's UTF-8 length. A `'\n'` starts a new line
/// (line + 1, column reset to 1); any other character advances the column by one.
fn bump(&mut self) -> Option<char> {
    let ch = self.peek()?;
    self.pos += ch.len_utf8();
    match ch {
        '\n' => {
            self.line += 1;
            self.column = 1;
        }
        _ => self.column += 1,
    }
    Some(ch)
}
/// Consumes the next character only if it equals `c`; returns whether it did.
#[allow(dead_code)]
fn eat(&mut self, c: char) -> bool {
    let matched = self.peek() == Some(c);
    if matched {
        self.bump();
    }
    matched
}
/// Consumes `s` if the unread input starts with it; returns whether it did.
///
/// Characters are consumed one at a time through `bump`, so line and column
/// tracking stays correct even when `s` contains newlines.
#[allow(dead_code)]
fn eat_str(&mut self, s: &str) -> bool {
    if !self.input[self.pos..].starts_with(s) {
        return false;
    }
    s.chars().for_each(|_| {
        self.bump();
    });
    true
}
/// Reads and returns the next token, skipping any `/* ... */` comments first.
///
/// Returns [`Token::Eof`] once the input is exhausted. The order of the checks
/// below matters: `-->` has to be recognised before the number and identifier
/// checks, because `--` would otherwise start an identifier.
fn next_token(&mut self) -> Token<'i> {
    // Skip any number of consecutive comments. An unterminated comment runs
    // to the end of the input.
    while self.input[self.pos..].starts_with("/*") {
        // "/*" contains no newline, so the cursor can be moved directly.
        self.pos += 2;
        self.column += 2;
        loop {
            if self.pos >= self.input.len() {
                break;
            }
            if self.input[self.pos..].starts_with("*/") {
                self.pos += 2;
                self.column += 2;
                break;
            }
            // Go through `bump` so newlines inside the comment are counted.
            self.bump();
        }
    }

    let c = match self.peek() {
        Some(c) => c,
        None => return Token::Eof,
    };

    if is_whitespace(c) {
        // Collapse the whole run into a single token.
        while matches!(self.peek(), Some(w) if is_whitespace(w)) {
            self.bump();
        }
        return Token::Whitespace;
    }

    if matches!(c, '"' | '\'') {
        return self.consume_string(c);
    }

    if self.input[self.pos..].starts_with("<!--") {
        self.pos += 4;
        self.column += 4;
        return Token::Cdo;
    }
    if self.input[self.pos..].starts_with("-->") {
        self.pos += 3;
        self.column += 3;
        return Token::Cdc;
    }

    if c == '#' {
        self.bump();
        let Some(first) = self.peek() else {
            return Token::Delim('#');
        };
        // Decided before the name is consumed, while the lookahead is intact.
        let is_id = would_start_ident(first, self.peek_nth(1), self.peek_nth(2));
        if is_name_char(first) || starts_escape(first, self.peek_nth(1)) {
            let value = self.consume_name();
            return Token::Hash { value, is_id };
        }
        return Token::Delim('#');
    }

    if c == '@' {
        self.bump();
        return match self.peek() {
            Some(first) if would_start_ident(first, self.peek_nth(1), self.peek_nth(2)) => {
                Token::AtKeyword(self.consume_name())
            }
            _ => Token::Delim('@'),
        };
    }

    if would_start_number(c, self.peek_nth(1), self.peek_nth(2)) {
        let value = self.consume_number();
        // The suffix decides between percentage, dimension and plain number.
        return match self.peek() {
            Some('%') => {
                self.bump();
                Token::Percentage(value)
            }
            Some(next) if would_start_ident(next, self.peek_nth(1), self.peek_nth(2)) => {
                let unit = self.consume_name();
                Token::Dimension { value, unit }
            }
            _ => Token::Number(value),
        };
    }

    if would_start_ident(c, self.peek_nth(1), self.peek_nth(2)) {
        return self.consume_ident_like();
    }

    // Everything left is a single-character token.
    self.bump();
    match c {
        ':' => Token::Colon,
        ';' => Token::Semicolon,
        ',' => Token::Comma,
        '[' => Token::LeftSquare,
        ']' => Token::RightSquare,
        '(' => Token::LeftParen,
        ')' => Token::RightParen,
        '{' => Token::LeftBrace,
        '}' => Token::RightBrace,
        other => Token::Delim(other),
    }
}
/// Consumes a quoted string whose opening quote is the next character.
///
/// `end_char` is the quote character that terminates the string.
///
/// Returns
/// * [`Token::String`] with the contents (quotes excluded) when the closing
///   quote is found, or when the input ends first;
/// * [`Token::BadString`] when an unescaped newline is met. The newline itself
///   is left unconsumed.
///
/// The contents are borrowed from the input unless a backslash is met, in
/// which case an owned, decoded copy is built. A backslash immediately
/// followed by a newline is a line continuation: both are consumed and nothing
/// is added to the value.
fn consume_string(&mut self, end_char: char) -> Token<'i> {
    self.bump(); // opening quote
    let start = self.pos;
    // Stays `None` until the first backslash forces an owned copy.
    let mut owned: Option<String> = None;
    while let Some(c) = self.peek() {
        match c {
            ch if ch == end_char => {
                // Take the slice before consuming the closing quote.
                let value = match owned {
                    Some(s) => Cow::Owned(s),
                    None => Cow::Borrowed(&self.input[start..self.pos]),
                };
                self.bump();
                return Token::String(value);
            }
            '\n' => return Token::BadString,
            '\\' => {
                // Seed the owned buffer with everything read so far.
                let buf = owned.get_or_insert_with(|| self.input[start..self.pos].to_string());
                self.bump();
                if self.peek() == Some('\n') {
                    // Escaped newline: consume it and contribute nothing.
                    self.bump();
                } else if let Some(esc) = self.consume_escape() {
                    buf.push(esc);
                }
            }
            _ => {
                if let Some(buf) = owned.as_mut() {
                    buf.push(c);
                }
                self.bump();
            }
        }
    }
    // End of input before the closing quote: return what was collected.
    match owned {
        Some(s) => Token::String(Cow::Owned(s)),
        None => Token::String(Cow::Borrowed(&self.input[start..self.pos])),
    }
}
/// Consumes a name and classifies it as an identifier, a function or a URL.
///
/// * Name not followed by `(` gives [`Token::Ident`].
/// * Name followed by `(` gives [`Token::Function`]; the `(` is consumed.
/// * `url(` (ASCII case-insensitive) gives [`Token::Url`] or [`Token::BadUrl`].
///   A quoted argument is read with `consume_string`; anything else is handed
///   to `consume_unquoted_url`.
///
/// For a quoted URL the token ends at the first `)` after the string. If the
/// string was malformed, the result is `BadUrl`, but nothing beyond that `)`
/// is consumed.
fn consume_ident_like(&mut self) -> Token<'i> {
    let name = self.consume_name();
    if self.peek() != Some('(') {
        return Token::Ident(name);
    }
    self.bump(); // '('
    if !name.eq_ignore_ascii_case("url") {
        return Token::Function(name);
    }

    // Whitespace between `url(` and the argument is not significant.
    while self.peek().map(is_whitespace).unwrap_or(false) {
        self.bump();
    }
    let quote = match self.peek() {
        Some(q @ ('"' | '\'')) => q,
        _ => return self.consume_unquoted_url(),
    };

    let str_tok = self.consume_string(quote);
    while self.peek().map(is_whitespace).unwrap_or(false) {
        self.bump();
    }
    if self.peek() == Some(')') {
        self.bump();
        // The closing parenthesis is consumed, so the token ends here
        // whether or not the string itself was well formed.
        return match str_tok {
            Token::String(s) => Token::Url(s),
            _ => Token::BadUrl,
        };
    }

    // Something other than `)` followed the string: discard up to and
    // including the next `)` (or to the end of input).
    while let Some(c) = self.bump() {
        if c == ')' {
            break;
        }
    }
    Token::BadUrl
}
fn consume_unquoted_url(&mut self) -> Token<'i> {
let mut owned = String::new();
loop {
match self.peek() {
Some(')') => {
self.bump();
return Token::Url(Cow::Owned(owned));
},
None => return Token::Url(Cow::Owned(owned)),
Some(c) if is_whitespace(c) => {
while self.peek().map(is_whitespace).unwrap_or(false) {
self.bump();
}
if self.peek() == Some(')') {
self.bump();
return Token::Url(Cow::Owned(owned));
}
while let Some(c) = self.peek() {
if c == ')' {
self.bump();
break;
}
self.bump();
}
return Token::BadUrl;
},
Some('"') | Some('\'') | Some('(') => {
while let Some(c) = self.peek() {
if c == ')' {
self.bump();
break;
}
self.bump();
}
return Token::BadUrl;
},
Some('\\') => {
self.bump();
if let Some(esc) = self.consume_escape() {
owned.push(esc);
} else {
while let Some(c) = self.peek() {
if c == ')' {
self.bump();
break;
}
self.bump();
}
return Token::BadUrl;
}
},
Some(c) => {
owned.push(c);
self.bump();
},
}
}
}
fn consume_name(&mut self) -> Cow<'i, str> {
let start = self.pos;
let mut owned: Option<String> = None;
while let Some(c) = self.peek() {
if is_name_char(c) {
if let Some(buf) = owned.as_mut() {
buf.push(c);
}
self.bump();
} else if c == '\\' && self.peek_nth(1).map(|c| c != '\n').unwrap_or(true) {
let mut buf = owned.unwrap_or_else(|| self.input[start..self.pos].to_string());
self.bump();
if let Some(esc) = self.consume_escape() {
buf.push(esc);
}
owned = Some(buf);
} else {
break;
}
}
match owned {
Some(s) => Cow::Owned(s),
None => Cow::Borrowed(&self.input[start..self.pos]),
}
}
fn consume_number(&mut self) -> Number {
let start = self.pos;
if matches!(self.peek(), Some('+') | Some('-')) {
self.bump();
}
while self.peek().map(|c| c.is_ascii_digit()).unwrap_or(false) {
self.bump();
}
let mut is_integer = true;
if self.peek() == Some('.')
&& self
.peek_nth(1)
.map(|c| c.is_ascii_digit())
.unwrap_or(false)
{
is_integer = false;
self.bump(); while self.peek().map(|c| c.is_ascii_digit()).unwrap_or(false) {
self.bump();
}
}
if matches!(self.peek(), Some('e') | Some('E')) {
let mut tentative = self.pos;
tentative += 1; let after_e = self.input[tentative..].chars().next();
let after_e_2 = self.input[tentative..].chars().nth(1);
let exp_ok = match (after_e, after_e_2) {
(Some(c), _) if c.is_ascii_digit() => true,
(Some('+'), Some(c)) | (Some('-'), Some(c)) if c.is_ascii_digit() => true,
_ => false,
};
if exp_ok {
is_integer = false;
self.bump();
if matches!(self.peek(), Some('+') | Some('-')) {
self.bump();
}
while self.peek().map(|c| c.is_ascii_digit()).unwrap_or(false) {
self.bump();
}
}
}
let text = &self.input[start..self.pos];
let value = text.parse::<f64>().unwrap_or(0.0);
Number { value, is_integer }
}
/// Decodes an escape sequence; the cursor must be just past the backslash.
///
/// Returns `None`, consuming nothing, at end of input or when the next
/// character is a newline. Otherwise returns the decoded character:
///
/// * One to six hex digits are read as a code point. One whitespace character
///   directly after them is consumed as a terminator, and a `\r\n` pair counts
///   as one. A value of zero, a surrogate, or anything above U+10FFFF decodes
///   to U+FFFD.
/// * Any other character stands for itself.
fn consume_escape(&mut self) -> Option<char> {
    let first = self.peek()?;
    if first == '\n' {
        return None;
    }
    if !first.is_ascii_hexdigit() {
        self.bump();
        return Some(first);
    }

    let mut code = 0u32;
    for _ in 0..6 {
        // `to_digit(16)` is `Some` exactly for ASCII hex digits.
        let Some(digit) = self.peek().and_then(|d| d.to_digit(16)) else {
            break;
        };
        code = code * 16 + digit;
        self.bump();
    }

    // A single whitespace after the digits ends the escape and is swallowed.
    if self.peek() == Some('\r') && self.peek_nth(1) == Some('\n') {
        self.bump();
        self.bump();
    } else if self.peek().map(is_whitespace).unwrap_or(false) {
        self.bump();
    }

    match char::from_u32(code) {
        // NUL is a valid `char`, but must not be produced by an escape.
        Some('\0') | None => Some('\u{FFFD}'),
        decoded => decoded,
    }
}
}
/// Returns `true` for space, tab, line feed, carriage return and form feed.
fn is_whitespace(c: char) -> bool {
    const WHITESPACE: [char; 5] = [' ', '\t', '\n', '\r', '\x0c'];
    WHITESPACE.contains(&c)
}
/// Returns `true` when `c` may begin a name: an ASCII letter, `_`, or any
/// non-ASCII character (U+0080 and above).
fn is_name_start(c: char) -> bool {
    match c {
        'a'..='z' | 'A'..='Z' | '_' => true,
        other => !other.is_ascii(),
    }
}
/// Returns `true` when `c` may appear inside a name: anything accepted by
/// `is_name_start`, plus ASCII digits and `-`.
fn is_name_char(c: char) -> bool {
    matches!(c, '0'..='9' | '-') || is_name_start(c)
}
/// Returns `true` when `c` is a backslash that is not followed by a newline.
/// A backslash at the very end of the input (`next` is `None`) counts.
fn starts_escape(c: char, next: Option<char>) -> bool {
    if c != '\\' {
        return false;
    }
    !matches!(next, Some('\n'))
}
/// Decides whether `c` followed by `next1` and `next2` begins an identifier.
///
/// * `-` qualifies when followed by a name-start character, another `-`, or a
///   valid escape.
/// * A name-start character always qualifies.
/// * A backslash qualifies when it starts a valid escape.
fn would_start_ident(c: char, next1: Option<char>, next2: Option<char>) -> bool {
    if c == '-' {
        return match next1 {
            Some(n) => is_name_start(n) || n == '-' || starts_escape(n, next2),
            None => false,
        };
    }
    // `starts_escape` is only true when `c` is a backslash.
    is_name_start(c) || starts_escape(c, next1)
}
/// Decides whether `c` followed by `next1` and `next2` begins a number.
///
/// * A digit always qualifies.
/// * `.` qualifies when followed by a digit.
/// * `+` or `-` qualifies when followed by a digit, or by `.` and then a digit.
fn would_start_number(c: char, next1: Option<char>, next2: Option<char>) -> bool {
    let is_digit = |o: Option<char>| o.map_or(false, |d| d.is_ascii_digit());
    match c {
        '0'..='9' => true,
        '.' => is_digit(next1),
        '+' | '-' => is_digit(next1) || (next1 == Some('.') && is_digit(next2)),
        _ => false,
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenizes `input` and returns the tokens with whitespace tokens removed.
    fn toks(input: &str) -> Vec<Token<'_>> {
        let mut kept = Vec::new();
        for (token, _) in tokenize(input).unwrap() {
            if !matches!(token, Token::Whitespace) {
                kept.push(token);
            }
        }
        kept
    }

    /// Shorthand for building a `Number`.
    fn n(value: f64, is_int: bool) -> Number {
        Number {
            value,
            is_integer: is_int,
        }
    }

    /// Shorthand for an identifier token.
    fn ident(name: &str) -> Token<'_> {
        Token::Ident(name.into())
    }

    /// Shorthand for a dimension token.
    fn dim(value: f64, is_int: bool, unit: &str) -> Token<'_> {
        Token::Dimension {
            value: n(value, is_int),
            unit: unit.into(),
        }
    }

    /// Shorthand for a hash token.
    fn hash(value: &str, is_id: bool) -> Token<'_> {
        Token::Hash {
            value: value.into(),
            is_id,
        }
    }

    #[test]
    fn punctuation() {
        let expected = vec![
            Token::LeftBrace,
            Token::RightBrace,
            Token::LeftParen,
            Token::RightParen,
            Token::Semicolon,
            Token::Colon,
            Token::Comma,
            Token::LeftSquare,
            Token::RightSquare,
            Token::Eof,
        ];
        assert_eq!(toks("{}();:,[]"), expected);
    }

    #[test]
    fn ident_and_function() {
        let expected = vec![
            ident("display"),
            Token::Function("calc".into()),
            ident("foo-bar"),
            ident("--custom-prop"),
            Token::Eof,
        ];
        assert_eq!(toks("display calc( foo-bar --custom-prop"), expected);
    }

    #[test]
    fn numbers_and_dimensions() {
        let expected = vec![
            Token::Number(n(12.0, true)),
            Token::Number(n(1.5, false)),
            Token::Number(n(-0.5, false)),
            dim(12.0, true, "px"),
            dim(1.5, false, "em"),
            Token::Percentage(n(50.0, true)),
            Token::Eof,
        ];
        assert_eq!(toks("12 1.5 -.5 12px 1.5em 50%"), expected);
    }

    #[test]
    fn hash_id_vs_unrestricted() {
        let expected = vec![
            hash("myid", true),
            hash("abc", true),
            hash("123", false),
            Token::Eof,
        ];
        assert_eq!(toks("#myid #abc #123"), expected);
    }

    #[test]
    fn at_keywords() {
        let expected = vec![
            Token::AtKeyword("media".into()),
            Token::AtKeyword("page".into()),
            Token::AtKeyword("font-face".into()),
            Token::Eof,
        ];
        assert_eq!(toks("@media @page @font-face"), expected);
    }

    #[test]
    fn strings_and_escapes() {
        let expected = vec![
            Token::String("hello".into()),
            Token::String("world".into()),
            Token::String("esépace".into()),
            Token::Eof,
        ];
        assert_eq!(toks(r#""hello" 'world' "es\E9pace""#), expected);
    }

    #[test]
    fn unterminated_string_yields_bad_string() {
        let tokens = toks("\"unterm\nrest");
        assert!(matches!(tokens[0], Token::BadString));
    }

    #[test]
    fn url_quoted_and_unquoted() {
        let expected = vec![
            Token::Url("https://x".into()),
            Token::Url("file.png".into()),
            Token::Url("spaced".into()),
            Token::Eof,
        ];
        assert_eq!(
            toks(r#"url("https://x") url(file.png) url( spaced )"#),
            expected,
        );
    }

    #[test]
    fn url_with_internal_whitespace_is_bad() {
        let tokens = toks("url(foo bar)");
        assert!(matches!(tokens[0], Token::BadUrl));
    }

    #[test]
    fn cdo_cdc() {
        let expected = vec![Token::Cdo, ident("color"), Token::Cdc, Token::Eof];
        assert_eq!(toks("<!-- color -->"), expected);
    }

    #[test]
    fn comments_skipped() {
        let expected = vec![ident("color"), Token::Colon, ident("red"), Token::Eof];
        assert_eq!(
            toks("/* comment */color/* between */: red /* trailing */"),
            expected,
        );
    }

    #[test]
    fn whitespace_runs_collapse_to_one_token() {
        let ws_count = tokenize("a b")
            .unwrap()
            .iter()
            .filter(|(token, _)| matches!(token, Token::Whitespace))
            .count();
        assert_eq!(ws_count, 1);
    }

    #[test]
    fn delim_for_unknown_punct() {
        assert_eq!(toks("&"), vec![Token::Delim('&'), Token::Eof]);
    }

    #[test]
    fn source_locations_track_lines() {
        let mut lines: Vec<u32> = Vec::new();
        for (_, location) in tokenize("a\nb").unwrap() {
            lines.push(location.line);
        }
        assert_eq!(lines, vec![1, 1, 2, 2]);
    }

    #[test]
    fn calc_with_units() {
        let expected = vec![
            Token::Function("calc".into()),
            Token::Percentage(n(100.0, true)),
            Token::Delim('-'),
            dim(10.0, true, "px"),
            Token::RightParen,
            Token::Eof,
        ];
        assert_eq!(toks("calc(100% - 10px)"), expected);
    }

    #[test]
    fn rgba_parses_as_function() {
        let tokens = toks("rgba(255, 0, 128, 0.5)");
        assert!(matches!(&tokens[0], Token::Function(name) if name == "rgba"));
    }

    #[test]
    fn at_media_print_block() {
        let tokens = toks("@media print { body { color: black; } }");
        assert!(matches!(&tokens[0], Token::AtKeyword(k) if k == "media"));
        assert!(tokens.contains(&Token::LeftBrace));
        assert!(tokens.contains(&Token::RightBrace));
    }
}