use std::ops::Range;
use std::cell::Cell;
use std::char;
use std::ascii::AsciiExt;
use std::borrow::{Cow, ToOwned};
use std::borrow::Cow::{Owned, Borrowed};
use std::i32;
use self::Token::*;
/// A single token as produced by `next_token`.
///
/// Variants carrying a `Cow<'a, str>` borrow from the tokenizer input when the
/// text could be used verbatim and own a `String` when escapes or NUL
/// characters had to be decoded.
#[derive(Clone, Debug, PartialEq)]
pub enum Token<'a> {
    /// A name that is not immediately followed by `(`.
    Ident(Cow<'a, str>),
    /// `@` followed by an identifier; the payload is the name after the `@`.
    AtKeyword(Cow<'a, str>),
    /// `#` followed by name characters that do not form an identifier start
    /// (for example a name that begins with a digit). The payload excludes `#`.
    Hash(Cow<'a, str>),
    /// `#` followed by something that starts an identifier. The payload excludes `#`.
    IDHash(Cow<'a, str>),
    /// The contents of a quoted string, without the surrounding quotes.
    QuotedString(Cow<'a, str>),
    /// The argument of a `url(` … `)` construct, quoted or unquoted.
    Url(Cow<'a, str>),
    /// Any single character that did not start another kind of token.
    Delim(char),
    /// A number with neither a `%` nor a unit after it.
    Number(NumericValue),
    /// A number immediately followed by `%`.
    Percentage(PercentageValue),
    /// A number immediately followed by a name; the second field is that name.
    Dimension(NumericValue, Cow<'a, str>),
    /// A `u+…` range as `(start, end)`.
    UnicodeRange(u32, u32),
    /// A run of spaces, tabs, line feeds, carriage returns and form feeds,
    /// as a slice of the input.
    WhiteSpace(&'a str),
    /// The text between `/*` and `*/` (or the end of input), delimiters excluded.
    Comment(&'a str),
    /// `:`
    Colon,
    /// `;`
    Semicolon,
    /// `,`
    Comma,
    /// `~=`
    IncludeMatch,
    /// `|=`
    DashMatch,
    /// `^=`
    PrefixMatch,
    /// `$=`
    SuffixMatch,
    /// `*=`
    SubstringMatch,
    /// `||`
    Column,
    /// `<!--`
    CDO,
    /// `-->`
    CDC,
    /// A name immediately followed by `(` (other than `url`); the `(` is consumed
    /// and the payload is the name.
    Function(Cow<'a, str>),
    /// `(`
    ParenthesisBlock,
    /// `[`
    SquareBracketBlock,
    /// `{`
    CurlyBracketBlock,
    /// A `url(` construct that could not be tokenized as a valid URL.
    BadUrl,
    /// A quoted string that ran into an unescaped newline.
    BadString,
    /// `)`
    CloseParenthesis,
    /// `]`
    CloseSquareBracket,
    /// `}`
    CloseCurlyBracket,
}
impl<'a> Token<'a> {
    /// Returns `true` when this token is `BadUrl`, `BadString`, or one of the
    /// three closing-bracket tokens, and `false` for every other variant.
    ///
    /// NOTE(review): the closing brackets are presumably listed because a
    /// higher-level parser consumes matched ones as part of a block — confirm
    /// against the caller.
    pub fn is_parse_error(&self) -> bool {
        match *self {
            BadUrl
            | BadString
            | CloseParenthesis
            | CloseSquareBracket
            | CloseCurlyBracket => true,
            _ => false,
        }
    }
}
/// The numeric payload of `Number` and `Dimension` tokens.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct NumericValue {
    /// The parsed value, computed in `f64` and narrowed to `f32`.
    pub value: f32,
    /// `Some` only when the source text had neither a fractional part nor an
    /// exponent; the value is clamped to the `i32` range.
    pub int_value: Option<i32>,
    /// Whether the source text started with an explicit `+` or `-`.
    pub has_sign: bool,
}
/// The numeric payload of a `Percentage` token.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct PercentageValue {
    /// The written number divided by 100, so `50%` is stored as `0.5`.
    pub unit_value: f32,
    /// The written number (not divided by 100) when it had neither a
    /// fractional part nor an exponent; clamped to the `i32` range.
    pub int_value: Option<i32>,
    /// Whether the source text started with an explicit `+` or `-`.
    pub has_sign: bool,
}
/// Cursor over a CSS source string that hands out tokens one at a time.
#[derive(Clone)]
pub struct Tokenizer<'a> {
    /// The complete text being tokenized.
    input: &'a str,
    /// Byte offset into `input` of the next unread character.
    position: usize,
    /// Cache used by `source_location`: `(line number, byte offset where that
    /// line starts)`. Kept in a `Cell` so it can be refreshed through `&self`.
    last_known_line_break: Cell<(usize, usize)>,
    /// Tracks whether a `var(` function has been seen since the caller asked
    /// the tokenizer to look for one.
    var_functions: VarFunctions,
}
/// State of the tokenizer's `var()` detection.
#[derive(Clone, Copy, Eq, PartialEq)]
enum VarFunctions {
    /// Not tracking; the state of a new tokenizer and the state restored by
    /// `seen_var_functions`.
    DontCare,
    /// Tracking was requested through `look_for_var_functions` and no `var(`
    /// has been tokenized yet.
    LookingForThem,
    /// A function named `var` (ASCII case-insensitive) was tokenized while tracking.
    SeenAtLeastOne,
}
impl<'a> Tokenizer<'a> {
#[inline]
pub fn new(input: &str) -> Tokenizer {
Tokenizer {
input: input,
position: 0,
last_known_line_break: Cell::new((1, 0)),
var_functions: VarFunctions::DontCare,
}
}
#[inline]
pub fn look_for_var_functions(&mut self) {
self.var_functions = VarFunctions::LookingForThem;
}
#[inline]
pub fn seen_var_functions(&mut self) -> bool {
let seen = self.var_functions == VarFunctions::SeenAtLeastOne;
self.var_functions = VarFunctions::DontCare;
seen
}
#[inline]
pub fn next(&mut self) -> Result<Token<'a>, ()> {
next_token(self).ok_or(())
}
#[inline]
pub fn position(&self) -> SourcePosition {
SourcePosition(self.position)
}
#[inline]
pub fn reset(&mut self, new_position: SourcePosition) {
self.position = new_position.0;
}
#[inline]
pub fn slice_from(&self, start_pos: SourcePosition) -> &'a str {
&self.input[start_pos.0..self.position]
}
#[inline]
pub fn slice(&self, range: Range<SourcePosition>) -> &'a str {
&self.input[range.start.0..range.end.0]
}
#[inline]
pub fn current_source_location(&self) -> SourceLocation {
let position = SourcePosition(self.position);
self.source_location(position)
}
pub fn source_location(&self, position: SourcePosition) -> SourceLocation {
let target = position.0;
let mut line_number;
let mut position;
let (last_known_line_number, position_after_last_known_newline) =
self.last_known_line_break.get();
if target >= position_after_last_known_newline {
position = position_after_last_known_newline;
line_number = last_known_line_number;
} else {
position = 0;
line_number = 1;
}
let mut source = &self.input[position..target];
while let Some(newline_position) = source.find(&['\n', '\r', '\x0C'][..]) {
let offset = newline_position +
if source[newline_position..].starts_with("\r\n") {
2
} else {
1
};
source = &source[offset..];
position += offset;
line_number += 1;
}
debug_assert!(position <= target);
self.last_known_line_break.set((line_number, position));
SourceLocation {
line: line_number,
column: target - position + 1,
}
}
#[inline]
pub fn next_byte(&self) -> Option<u8> {
if self.is_eof() {
None
} else {
Some(self.input.as_bytes()[self.position])
}
}
#[inline]
fn is_eof(&self) -> bool { !self.has_at_least(0) }
#[inline]
fn has_at_least(&self, n: usize) -> bool { self.position + n < self.input.len() }
#[inline]
pub fn advance(&mut self, n: usize) { self.position += n }
#[inline]
fn next_char(&self) -> char { self.char_at(0) }
#[inline]
fn char_at(&self, offset: usize) -> char {
self.input[self.position + offset..].chars().next().unwrap()
}
#[inline]
fn has_newline_at(&self, offset: usize) -> bool {
self.position + offset < self.input.len() &&
matches!(self.char_at(offset), '\n' | '\r' | '\x0C')
}
#[inline]
fn consume_char(&mut self) -> char {
let c = self.next_char();
self.position += c.len_utf8();
c
}
#[inline]
fn starts_with(&self, needle: &str) -> bool {
self.input[self.position..].starts_with(needle)
}
}
/// An opaque byte offset into the tokenizer input, as handed out by
/// `Tokenizer::position`. Positions order by offset.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub struct SourcePosition(usize);
/// A line/column pair as computed by `Tokenizer::source_location`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct SourceLocation {
    /// Line number; the first line is 1.
    pub line: usize,
    /// Byte distance from the start of the line, plus one, so the first column is 1.
    pub column: usize,
}
/// Consumes one token starting at the tokenizer's current position.
///
/// Returns `None` when the input is exhausted. Otherwise the first character
/// selects the kind of token, and the tokenizer is left positioned just after
/// the text that belongs to it.
fn next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Option<Token<'a>> {
    /// With the cursor on a `+` or `-`, reports whether a number follows:
    /// either a digit, or a `.` that is itself followed by a digit.
    fn sign_starts_number(tokenizer: &Tokenizer) -> bool {
        (
            tokenizer.has_at_least(1)
            && matches!(tokenizer.char_at(1), '0'...'9')
        ) || (
            tokenizer.has_at_least(2)
            && tokenizer.char_at(1) == '.'
            && matches!(tokenizer.char_at(2), '0'...'9')
        )
    }

    if tokenizer.is_eof() {
        return None
    }
    let c = tokenizer.next_char();
    let token = match c {
        '\t' | '\n' | ' ' | '\r' | '\x0C' => {
            // Collapse the whole run of whitespace into a single token.
            let start_position = tokenizer.position();
            tokenizer.advance(1);
            while !tokenizer.is_eof() {
                match tokenizer.next_char() {
                    ' ' | '\t' | '\n' | '\r' | '\x0C' => tokenizer.advance(1),
                    _ => break,
                }
            }
            WhiteSpace(tokenizer.slice_from(start_position))
        },
        '"' => consume_string(tokenizer, false),
        '#' => {
            tokenizer.advance(1);
            if is_ident_start(tokenizer) {
                IDHash(consume_name(tokenizer))
            } else if !tokenizer.is_eof() && match tokenizer.next_char() {
                'a'...'z' | 'A'...'Z' | '0'...'9' | '-' | '_' => true,
                '\\' => !tokenizer.has_newline_at(1),
                // Non-ASCII. Test the character after the `#`, not the `#` itself.
                next => next > '\x7F',
            } {
                Hash(consume_name(tokenizer))
            } else {
                Delim(c)
            }
        },
        '$' => {
            if tokenizer.starts_with("$=") { tokenizer.advance(2); SuffixMatch }
            else { tokenizer.advance(1); Delim(c) }
        },
        '\'' => consume_string(tokenizer, true),
        '(' => { tokenizer.advance(1); ParenthesisBlock },
        ')' => { tokenizer.advance(1); CloseParenthesis },
        '*' => {
            if tokenizer.starts_with("*=") { tokenizer.advance(2); SubstringMatch }
            else { tokenizer.advance(1); Delim(c) }
        },
        '+' => {
            if sign_starts_number(tokenizer) {
                consume_numeric(tokenizer)
            } else {
                tokenizer.advance(1);
                Delim(c)
            }
        },
        ',' => { tokenizer.advance(1); Comma },
        '-' => {
            // Order matters: a signed number wins over `-->`, which wins over
            // an identifier that begins with `-`.
            if sign_starts_number(tokenizer) {
                consume_numeric(tokenizer)
            } else if tokenizer.starts_with("-->") {
                tokenizer.advance(3);
                CDC
            } else if is_ident_start(tokenizer) {
                consume_ident_like(tokenizer)
            } else {
                tokenizer.advance(1);
                Delim(c)
            }
        },
        '.' => {
            if tokenizer.has_at_least(1)
                && matches!(tokenizer.char_at(1), '0'...'9') {
                consume_numeric(tokenizer)
            } else {
                tokenizer.advance(1);
                Delim(c)
            }
        }
        '/' if tokenizer.starts_with("/*") => {
            tokenizer.advance(2);  // Skip "/*"
            let start_position = tokenizer.position();
            let content;
            match tokenizer.input[tokenizer.position..].find("*/") {
                Some(offset) => {
                    tokenizer.advance(offset);
                    content = tokenizer.slice_from(start_position);
                    tokenizer.advance(2);  // Skip "*/"
                }
                None => {
                    // Unterminated comment: it runs to the end of the input.
                    tokenizer.position = tokenizer.input.len();
                    content = tokenizer.slice_from(start_position);
                }
            }
            Comment(content)
        }
        '0'...'9' => consume_numeric(tokenizer),
        ':' => { tokenizer.advance(1); Colon },
        ';' => { tokenizer.advance(1); Semicolon },
        '<' => {
            if tokenizer.starts_with("<!--") {
                tokenizer.advance(4);
                CDO
            } else {
                tokenizer.advance(1);
                Delim(c)
            }
        },
        '@' => {
            tokenizer.advance(1);
            if is_ident_start(tokenizer) { AtKeyword(consume_name(tokenizer)) }
            else { Delim(c) }
        },
        'u' | 'U' => {
            if tokenizer.has_at_least(2)
                && tokenizer.char_at(1) == '+'
                && matches!(tokenizer.char_at(2), '0'...'9' | 'a'...'f' | 'A'...'F' | '?')
            { consume_unicode_range(tokenizer) }
            else { consume_ident_like(tokenizer) }
        },
        'a'...'z' | 'A'...'Z' | '_' | '\0' => consume_ident_like(tokenizer),
        '[' => { tokenizer.advance(1); SquareBracketBlock },
        '\\' => {
            // A backslash starts an identifier unless it is followed by a newline.
            if !tokenizer.has_newline_at(1) { consume_ident_like(tokenizer) }
            else { tokenizer.advance(1); Delim(c) }
        },
        ']' => { tokenizer.advance(1); CloseSquareBracket },
        '^' => {
            if tokenizer.starts_with("^=") { tokenizer.advance(2); PrefixMatch }
            else { tokenizer.advance(1); Delim(c) }
        },
        '{' => { tokenizer.advance(1); CurlyBracketBlock },
        '|' => {
            if tokenizer.starts_with("|=") { tokenizer.advance(2); DashMatch }
            else if tokenizer.starts_with("||") { tokenizer.advance(2); Column }
            else { tokenizer.advance(1); Delim(c) }
        },
        '}' => { tokenizer.advance(1); CloseCurlyBracket },
        '~' => {
            if tokenizer.starts_with("~=") { tokenizer.advance(2); IncludeMatch }
            else { tokenizer.advance(1); Delim(c) }
        },
        _ => {
            if c > '\x7F' {  // Non-ASCII
                consume_ident_like(tokenizer)
            } else {
                tokenizer.advance(1);
                Delim(c)
            }
        },
    };
    Some(token)
}
fn consume_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool) -> Token<'a> {
match consume_quoted_string(tokenizer, single_quote) {
Ok(value) => QuotedString(value),
Err(()) => BadString
}
}
/// Consumes a quoted string whose opening quote is at the cursor.
///
/// `single_quote` selects which quote character closes the string. The result
/// excludes the quotes. It borrows from the input when the contents contain no
/// backslash or NUL, and is an owned, decoded `String` otherwise. Reaching the
/// end of input ends the string successfully. An unescaped newline yields
/// `Err(())` with the cursor left on that newline.
fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
                             -> Result<Cow<'a, str>, ()> {
    let closing_quote = if single_quote { '\'' } else { '"' };
    tokenizer.advance(1);  // Skip the opening quote
    let content_start = tokenizer.position();

    // First pass: as long as nothing needs decoding, the result is a plain
    // slice of the input.
    let mut decoded;
    loop {
        if tokenizer.is_eof() {
            return Ok(Borrowed(tokenizer.slice_from(content_start)))
        }
        match tokenizer.next_char() {
            c if c == closing_quote => {
                let contents = tokenizer.slice_from(content_start);
                tokenizer.advance(1);
                return Ok(Borrowed(contents))
            }
            '\\' | '\0' => {
                // Keep what was scanned so far and switch to building a String.
                decoded = tokenizer.slice_from(content_start).to_owned();
                break
            }
            '\n' | '\r' | '\x0C' => return Err(()),
            _ => {
                tokenizer.consume_char();
            }
        }
    }

    // Second pass: decode escapes and NUL characters one character at a time.
    while !tokenizer.is_eof() {
        let c = tokenizer.next_char();
        match c {
            '\n' | '\r' | '\x0C' => return Err(()),
            _ => tokenizer.advance(c.len_utf8()),
        }
        if c == closing_quote {
            break
        }
        match c {
            '\\' => {
                // A backslash at the very end of the input is dropped.
                if !tokenizer.is_eof() {
                    if tokenizer.starts_with("\r\n") {
                        // Escaped newline: contributes nothing to the value.
                        tokenizer.advance(2)
                    } else if matches!(tokenizer.next_char(), '\n' | '\x0C' | '\r') {
                        tokenizer.advance(1)
                    } else {
                        decoded.push(consume_escape(tokenizer))
                    }
                }
            }
            '\0' => decoded.push('\u{FFFD}'),
            other => decoded.push(other),
        }
    }

    Ok(Owned(decoded))
}
/// Reports whether the unread input begins an identifier.
///
/// That is the case when the next character is an ASCII letter, `_`, NUL or
/// non-ASCII; when it is a backslash not followed by a newline; or when it is
/// `-` followed by any of those, or by another `-`. Returns `false` at the end
/// of input. The tokenizer is not advanced.
#[inline]
fn is_ident_start(tokenizer: &mut Tokenizer) -> bool {
    !tokenizer.is_eof() && match tokenizer.next_char() {
        'a'...'z' | 'A'...'Z' | '_' | '\0' => true,
        '-' => tokenizer.has_at_least(1) && match tokenizer.char_at(1) {
            'a'...'z' | 'A'...'Z' | '-' | '_' | '\0' => true,
            // The backslash is at offset 1 here, so the character it escapes
            // is at offset 2; an escaped newline is not a valid escape.
            '\\' => !tokenizer.has_newline_at(2),
            c => c > '\x7F',  // Non-ASCII
        },
        '\\' => !tokenizer.has_newline_at(1),
        c => c > '\x7F',  // Non-ASCII
    }
}
/// Consumes a name and classifies it by what follows.
///
/// Without a directly following `(` the result is `Ident`. Otherwise the `(`
/// is consumed. A name equal to `url` (ASCII case-insensitive) hands over to
/// `consume_url`; any other name yields `Function`. While the tokenizer is
/// looking for `var()` functions, a function named `var` is recorded as seen.
fn consume_ident_like<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
    let name = consume_name(tokenizer);

    let opens_function = !tokenizer.is_eof() && tokenizer.next_char() == '(';
    if !opens_function {
        return Ident(name)
    }
    tokenizer.advance(1);  // Skip the '('

    if name.eq_ignore_ascii_case("url") {
        return consume_url(tokenizer)
    }
    if tokenizer.var_functions == VarFunctions::LookingForThem
        && name.eq_ignore_ascii_case("var") {
        tokenizer.var_functions = VarFunctions::SeenAtLeastOne;
    }
    Function(name)
}
/// Consumes the longest run of name characters at the cursor.
///
/// Name characters are ASCII letters, digits, `_`, `-`, any non-ASCII
/// character, NUL (decoded as U+FFFD) and backslash escapes. The result
/// borrows from the input unless a backslash or NUL was met, in which case it
/// is an owned, decoded `String`. A backslash followed by a newline ends the
/// name without being consumed.
fn consume_name<'a>(tokenizer: &mut Tokenizer<'a>) -> Cow<'a, str> {
    let name_start = tokenizer.position();

    // First pass: while nothing needs decoding the name is a plain slice.
    let mut decoded;
    loop {
        if tokenizer.is_eof() {
            return Borrowed(tokenizer.slice_from(name_start))
        }
        let c = tokenizer.next_char();
        match c {
            '\\' | '\0' => {
                decoded = tokenizer.slice_from(name_start).to_owned();
                break
            }
            'a'...'z' | 'A'...'Z' | '0'...'9' | '_' | '-' => tokenizer.advance(1),
            // Any other ASCII character ends the name.
            _ if c.is_ascii() => return Borrowed(tokenizer.slice_from(name_start)),
            // Non-ASCII characters are part of the name.
            _ => tokenizer.advance(c.len_utf8()),
        }
    }

    // Second pass: decode one character at a time into the owned buffer.
    while !tokenizer.is_eof() {
        let c = tokenizer.next_char();
        let piece = match c {
            '\\' => {
                if tokenizer.has_newline_at(1) {
                    break
                }
                tokenizer.advance(1);
                consume_escape(tokenizer)
            }
            '\0' => {
                tokenizer.advance(1);
                '\u{FFFD}'
            }
            'a'...'z' | 'A'...'Z' | '0'...'9' | '_' | '-' => {
                tokenizer.advance(1);
                c
            }
            _ if c.is_ascii() => break,
            _ => tokenizer.consume_char(),
        };
        decoded.push(piece);
    }
    Owned(decoded)
}
/// Consumes a number of the form `[+-]?\d*(\.\d+)?([eE][+-]?\d+)?` and
/// whatever directly qualifies it.
///
/// The caller must have checked that at least one digit is present. A trailing
/// `%` yields `Percentage`, a trailing identifier yields `Dimension`, and
/// anything else yields `Number`. All arithmetic is done in `f64`; the integer
/// value is reported only when there was no fractional part and no exponent,
/// and it is clamped to the `i32` range.
fn consume_numeric<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
    // Consumes an optional `+` or `-` at the cursor. Returns whether a sign
    // was present and the factor (`1.` or `-1.`) it stands for.
    fn consume_sign(tokenizer: &mut Tokenizer) -> (bool, f64) {
        let factor = match tokenizer.next_char() {
            '-' => -1.,
            '+' => 1.,
            _ => return (false, 1.),
        };
        tokenizer.advance(1);
        (true, factor)
    }

    let (has_sign, sign) = consume_sign(tokenizer);

    let mut integral_part: f64 = 0.;
    loop {
        let digit = match tokenizer.next_char().to_digit(10) {
            Some(digit) => digit,
            None => break,
        };
        integral_part = integral_part * 10. + digit as f64;
        tokenizer.advance(1);
        if tokenizer.is_eof() {
            break
        }
    }

    let mut is_integer = true;

    let mut fractional_part: f64 = 0.;
    // A '.' only belongs to the number when a digit follows it.
    let has_fraction = tokenizer.has_at_least(1)
        && tokenizer.next_char() == '.'
        && matches!(tokenizer.char_at(1), '0'...'9');
    if has_fraction {
        is_integer = false;
        tokenizer.advance(1);  // Skip the '.'
        let mut factor = 0.1;
        loop {
            let digit = match tokenizer.next_char().to_digit(10) {
                Some(digit) => digit,
                None => break,
            };
            fractional_part += digit as f64 * factor;
            factor *= 0.1;
            tokenizer.advance(1);
            if tokenizer.is_eof() {
                break
            }
        }
    }

    let mut value = sign * (integral_part + fractional_part);

    // An 'e'/'E' only starts an exponent when followed by a digit, or by a
    // sign and then a digit.
    let has_exponent = tokenizer.has_at_least(1)
        && matches!(tokenizer.next_char(), 'e' | 'E')
        && (
            matches!(tokenizer.char_at(1), '0'...'9')
            || (
                tokenizer.has_at_least(2)
                && matches!(tokenizer.char_at(1), '+' | '-')
                && matches!(tokenizer.char_at(2), '0'...'9')
            )
        );
    if has_exponent {
        is_integer = false;
        tokenizer.advance(1);  // Skip the 'e' / 'E'
        let (_, exponent_sign) = consume_sign(tokenizer);
        let mut exponent: f64 = 0.;
        loop {
            let digit = match tokenizer.next_char().to_digit(10) {
                Some(digit) => digit,
                None => break,
            };
            exponent = exponent * 10. + digit as f64;
            tokenizer.advance(1);
            if tokenizer.is_eof() {
                break
            }
        }
        value *= f64::powf(10., exponent_sign * exponent);
    }

    let int_value = if !is_integer {
        None
    } else if value >= i32::MAX as f64 {
        Some(i32::MAX)
    } else if value <= i32::MIN as f64 {
        Some(i32::MIN)
    } else {
        Some(value as i32)
    };

    if !tokenizer.is_eof() && tokenizer.next_char() == '%' {
        tokenizer.advance(1);
        return Percentage(PercentageValue {
            unit_value: (value as f32) / 100.,
            int_value: int_value,
            has_sign: has_sign,
        })
    }

    let number = NumericValue {
        value: value as f32,
        int_value: int_value,
        has_sign: has_sign,
    };
    if is_ident_start(tokenizer) {
        Dimension(number, consume_name(tokenizer))
    } else {
        Number(number)
    }
}
/// Consumes the remainder of a `url(` construct; the cursor must be just
/// after the opening parenthesis.
///
/// Leading whitespace is skipped. A quote starts a quoted URL and anything
/// else other than `)` starts an unquoted one. An immediate `)` or the end of
/// input yields an empty `Url`. Malformed input yields `BadUrl`, with the
/// cursor moved past the matching `)` when there is one.
fn consume_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
    while !tokenizer.is_eof() {
        match tokenizer.next_char() {
            ' ' | '\t' | '\n' | '\r' | '\x0C' => tokenizer.advance(1),
            '"' => return consume_quoted_url(tokenizer, false),
            '\'' => return consume_quoted_url(tokenizer, true),
            ')' => { tokenizer.advance(1); break },
            _ => return consume_unquoted_url(tokenizer),
        }
    }
    return Url(Borrowed(""));

    /// Consumes a quoted URL value and then the whitespace and `)` after it.
    /// A bad string turns the whole construct into a bad URL.
    fn consume_quoted_url<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool) -> Token<'a> {
        match consume_quoted_string(tokenizer, single_quote) {
            Ok(value) => consume_url_end(tokenizer, value),
            Err(()) => consume_bad_url(tokenizer),
        }
    }

    /// Consumes an unquoted URL value up to and including the closing `)`.
    /// The value borrows from the input until a backslash or NUL forces decoding.
    fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
        let start_pos = tokenizer.position();
        let mut string;
        loop {
            if tokenizer.is_eof() {
                return Url(Borrowed(tokenizer.slice_from(start_pos)))
            }
            match tokenizer.next_char() {
                ' ' | '\t' | '\n' | '\r' | '\x0C' => {
                    let value = tokenizer.slice_from(start_pos);
                    tokenizer.advance(1);
                    return consume_url_end(tokenizer, Borrowed(value))
                }
                ')' => {
                    let value = tokenizer.slice_from(start_pos);
                    tokenizer.advance(1);
                    return Url(Borrowed(value))
                }
                '\x01'...'\x08' | '\x0B' | '\x0E'...'\x1F' | '\x7F'  // non-printable
                    | '"' | '\'' | '(' => {
                    tokenizer.advance(1);
                    return consume_bad_url(tokenizer)
                },
                '\\' | '\0' => {
                    // Keep what was scanned so far and switch to building a String.
                    string = tokenizer.slice_from(start_pos).to_owned();
                    break
                }
                _ => {
                    tokenizer.consume_char();
                }
            }
        }
        while !tokenizer.is_eof() {
            let next_char = match tokenizer.consume_char() {
                ' ' | '\t' | '\n' | '\r' | '\x0C' => {
                    return consume_url_end(tokenizer, Owned(string))
                }
                ')' => break,
                '\x01'...'\x08' | '\x0B' | '\x0E'...'\x1F' | '\x7F'  // non-printable
                    | '"' | '\'' | '(' => return consume_bad_url(tokenizer),
                '\\' => {
                    // The backslash is already consumed, so the escaped
                    // character is at offset 0. An escaped newline is invalid here.
                    if tokenizer.has_newline_at(0) {
                        return consume_bad_url(tokenizer)
                    }
                    consume_escape(tokenizer)
                },
                '\0' => '\u{FFFD}',
                c => c
            };
            string.push(next_char)
        }
        Url(Owned(string))
    }

    /// After a complete URL value, accepts only whitespace up to the closing
    /// `)` (or the end of input); anything else makes the URL bad.
    fn consume_url_end<'a>(tokenizer: &mut Tokenizer<'a>, string: Cow<'a, str>) -> Token<'a> {
        while !tokenizer.is_eof() {
            match tokenizer.consume_char() {
                ' ' | '\t' | '\n' | '\r' | '\x0C' => (),
                ')' => break,
                _ => return consume_bad_url(tokenizer)
            }
        }
        Url(string)
    }

    /// Discards input up to and including the next unescaped `)`, or to the
    /// end of input, and returns `BadUrl`.
    fn consume_bad_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
        while !tokenizer.is_eof() {
            match tokenizer.consume_char() {
                ')' => break,
                '\\' => {
                    // Skip only an escaped ')' or '\'. Skipping one byte
                    // unconditionally could move the cursor past the end of
                    // the input or into the middle of a multi-byte character.
                    if !tokenizer.is_eof() && matches!(tokenizer.next_char(), ')' | '\\') {
                        tokenizer.advance(1)
                    }
                }
                _ => ()
            }
        }
        BadUrl
    }
}
/// Consumes a unicode-range token; the cursor must be on the `u`/`U` of a
/// `u+` prefix that is followed by a hex digit or `?`.
///
/// Up to six hex digits are read, then `?` wildcards up to a combined total of
/// six characters. With wildcards, the range spans every value the wildcards
/// can take. Without them, an optional `-` followed by hex digits gives the
/// end of the range, which otherwise equals the start.
fn consume_unicode_range<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
    tokenizer.advance(2);  // Skip "u+" / "U+"
    let (prefix, prefix_len) = consume_hex_digits(tokenizer);

    let wildcard_budget = 6 - prefix_len;
    let mut wildcards = 0;
    while wildcards < wildcard_budget
        && !tokenizer.is_eof()
        && tokenizer.next_char() == '?' {
        wildcards += 1;
        tokenizer.advance(1)
    }

    if wildcards > 0 {
        // Each '?' stands for one hex digit: all zeros for the start of the
        // range, all Fs for the end.
        let shift = wildcards * 4;
        return UnicodeRange(prefix << shift, ((prefix + 1) << shift) - 1)
    }

    let has_explicit_end = tokenizer.has_at_least(1)
        && tokenizer.next_char() == '-'
        && matches!(tokenizer.char_at(1), '0'...'9' | 'A'...'F' | 'a'...'f');
    let end = if has_explicit_end {
        tokenizer.advance(1);  // Skip the '-'
        let (explicit_end, _) = consume_hex_digits(tokenizer);
        explicit_end
    } else {
        prefix
    };
    UnicodeRange(prefix, end)
}
/// Consumes up to six hexadecimal digits at the cursor.
///
/// Returns `(value, count)`: the number the digits spell and how many were
/// consumed. Both are zero when the cursor is not on a hex digit.
fn consume_hex_digits<'a>(tokenizer: &mut Tokenizer<'a>) -> (u32, u32) {
    let mut value = 0;
    let mut count = 0;
    loop {
        if count >= 6 || tokenizer.is_eof() {
            break
        }
        let digit = match tokenizer.next_char().to_digit(16) {
            Some(digit) => digit,
            None => break,
        };
        value = value * 16 + digit;
        count += 1;
        tokenizer.advance(1);
    }
    (value, count)
}
/// Decodes the escape whose backslash has already been consumed.
///
/// * At the end of input the result is U+FFFD.
/// * Hex digits (up to six) give a code point, and one whitespace character
///   after them, with `\r\n` counting as one, is consumed as part of the
///   escape. Zero and values that are not valid `char`s become U+FFFD.
/// * NUL becomes U+FFFD.
/// * Any other character stands for itself.
fn consume_escape(tokenizer: &mut Tokenizer) -> char {
    const REPLACEMENT: char = '\u{FFFD}';

    if tokenizer.is_eof() {
        return REPLACEMENT  // Escaped EOF
    }
    match tokenizer.next_char() {
        '0'...'9' | 'A'...'F' | 'a'...'f' => {
            let (code_point, _) = consume_hex_digits(tokenizer);
            if !tokenizer.is_eof() {
                if tokenizer.starts_with("\r\n") {
                    tokenizer.advance(2)
                } else if matches!(tokenizer.next_char(), ' ' | '\t' | '\n' | '\x0C' | '\r') {
                    tokenizer.advance(1)
                }
            }
            if code_point == 0 {
                REPLACEMENT
            } else {
                char::from_u32(code_point).unwrap_or(REPLACEMENT)
            }
        },
        '\0' => {
            tokenizer.advance(1);
            REPLACEMENT
        }
        _ => tokenizer.consume_char()
    }
}