use lazy_static::lazy_static;
use regex::Regex;
use std::error::Error;
use std::fmt;

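/// Tokens produced by the selector lexer, loosely following the CSS 2.1 /
/// Selectors Level 3 token types. Variants carrying `&'a str` borrow their
/// text directly from the source being lexed.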
#[derive(Debug, Clone)]
pub enum Token<'a> {
    S,
    INCLUDES,
    DASHMATCH,
    PREFIXMATCH,
    SUFFIXMATCH,
    SUBSTRINGMATCH,
    IDENT(&'a str),
    STRING(&'a str),
    FUNCTION(&'a str),
    NUM(&'a str),
    HASH(&'a str),
    PLUS,
    GREATER,
    COMMA,
    TILDE,
    NOT,
    ATKEYWORD(&'a str),
    INVALID(&'a str),
    PERCENTAGE(&'a str),
    DIMENSION(&'a str, &'a str),
    CDO,
    CDC,
    COMMENTS(&'a str),
    COLON,
    DOT(&'a str),
    STAR,
    PIPE,
    SUB,
    LBRACK,
    RBRACK,
    EQ,
    RPAREN,
}

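/// Byte offsets (`start`..`stop`) of a lexed token in the original source,
/// together with the matched slice itself.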
#[derive(Debug, Clone)]
pub struct Span<'a> {
    pub start: usize,
    pub stop: usize,
    pub value: &'a str,
}

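/// A token paired with the span it was lexed from.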
pub type TokenSpan<'a> = (Span<'a>, Token<'a>);

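/// The result of a successful lex: the original source plus every token found in it.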
pub struct Lexer<'a> {
    pub src: &'a str,
    pub tokens: Vec<TokenSpan<'a>>,
}

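/// Error produced when no token could be matched at some position in the input.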
#[derive(Debug, Clone)]
pub struct LexerError<'a> {
    pub span: Span<'a>,
    pub message: String,
}

impl<'a> fmt::Display for LexerError<'a> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(
            f,
            "failed to lex @ {}:{} ('{}'): {}",
            self.span.start, self.span.stop, self.span.value, self.message
        )
    }
}

impl<'a> Error for LexerError<'a> {}

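// Regexes for the multi-character token classes; the patterns are adapted
// from the tokenizer productions in the CSS 2.1 / Selectors grammar.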
lazy_static! {
    static ref S: Regex = Regex::new(r"(?i)^[ \t\r\n\f]+").unwrap();
    static ref IDENT: Regex = Regex::new(r"(?i)^[-]?(?:[_a-z]|[^\x00-\x7F]|\\[0-9a-f]{1,6}(\r\n|[ \n\r\t\f])?|\\[^\n\r\f0-9a-f])(?:[_a-z0-9-]|[^\x00-\x7F]|\\[0-9a-f]{1,6}(\r\n|[ \n\r\t\f])?|\\[^\n\r\f0-9a-f])*").unwrap();
    // Both quote forms live under one anchored group: group 1 captures the
    // contents of a double-quoted string, group 2 of a single-quoted one.
    static ref STRING: Regex = Regex::new(r#"(?i)^(?:"((?:[^\n\r\f\\"]|\\\n|\r\n|\r|\f|[^\x00-\x7F]|\\[0-9a-f]{1,6}(?:\r\n|[ \n\r\t\f])?|\\[^\n\r\f0-9a-f])*)"|'((?:[^\n\r\f\\']|\\\n|\r\n|\r|\f|[^\x00-\x7F]|\\[0-9a-f]{1,6}(?:\r\n|[ \n\r\t\f])?|\\[^\n\r\f0-9a-f])*)')"#).unwrap();
    // Decimal alternative first: the regex crate prefers the earlier
    // alternation branch, so "[0-9]+" alone would truncate "1.5" to "1".
    static ref NUMBER: Regex = Regex::new(r"(?i)^(?:[0-9]*\.[0-9]+|[0-9]+)").unwrap();
    static ref PLUS_GREATER_COMMA_TILDE: Regex = Regex::new(r"(?i)^[ \t\r\n\f]*(\+|>|,|~)").unwrap();
    static ref NOT: Regex = Regex::new(r#"(?i)^:(?:n|\\0{0,4}(4e|6e)(\r\n|[ \t\r\n\f])?|\\n)(?:o|\\0{0,4}(4f|6f)(\r\n|[ \t\r\n\f])?|\\o)(?:t|\\0{0,4}(54|74)(\r\n|[ \t\r\n\f])?|\\t)\("#).unwrap();
    static ref COMMENTS: Regex = Regex::new(r"(?i)^/\*[^*]*\*+([^/*][^*]*\*+)*/").unwrap();
}

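/// Try to match an identifier at the very start of `src`, returning the matched slice.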
fn match_ident<'a>(src: &'a str) -> Option<&'a str> {
    IDENT.find(src).map(|ident| {
        assert_eq!(ident.start(), 0);
        &src[..ident.end()]
    })
}

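/// Match a single token at the start of `*src`, advancing `*src` past it on
/// success. On failure the cursor is advanced by one byte (if any input
/// remains) so the caller can attach a span to the offending character.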
fn match_token_raw<'a>(src: &mut &'a str) -> Result<Token<'a>, String> {
    if src.starts_with("~=") {
        *src = &src[2..];
        Ok(Token::INCLUDES)
    } else if src.starts_with("|=") {
        *src = &src[2..];
        Ok(Token::DASHMATCH)
    } else if src.starts_with("^=") {
        *src = &src[2..];
        Ok(Token::PREFIXMATCH)
    } else if src.starts_with("$=") {
        *src = &src[2..];
        Ok(Token::SUFFIXMATCH)
    } else if src.starts_with("*=") {
        *src = &src[2..];
        Ok(Token::SUBSTRINGMATCH)
    } else if let Some(ident) = IDENT.find(src) {
        assert_eq!(ident.start(), 0);
        let span = &src[0..ident.end()];
        *src = &src[ident.end()..];
        if src.starts_with("(") {
            *src = &src[1..];
            Ok(Token::FUNCTION(span))
        } else {
            Ok(Token::IDENT(span))
        }
    } else if let Some(string) = STRING.captures(src) {
        // Group 1 holds the contents of a double-quoted string, group 2 of a
        // single-quoted one; exactly one of the two is present.
        let internal = string.get(1).or_else(|| string.get(2)).unwrap();
        let string = string.get(0).unwrap();
        assert_eq!(string.start(), 0);
        let span = &src[internal.start()..internal.end()];
        *src = &src[string.end()..];
        Ok(Token::STRING(span))
    } else if let Some(num) = NUMBER.find(src) {
        assert_eq!(num.start(), 0);
        let span = &src[0..num.end()];
        *src = &src[num.end()..];
        if src.starts_with("%") {
            *src = &src[1..];
            Ok(Token::PERCENTAGE(span))
        } else if let Some(ident) = IDENT.find(src).map(|ident| {
            assert_eq!(ident.start(), 0);
            let span = &src[..ident.end()];
            *src = &src[ident.end()..];
            span
        }) {
            Ok(Token::DIMENSION(span, ident))
        } else {
            Ok(Token::NUM(span))
        }
    } else if src.starts_with("#") {
        *src = &src[1..];
        let ident = match_ident(*src);
        if ident.is_none() {
            return Err("expected ident after '#'".to_string());
        }
        let ident = ident.unwrap();
        *src = &src[ident.len()..];
        Ok(Token::HASH(ident))
    } else if let Some(result) = PLUS_GREATER_COMMA_TILDE.captures(src) {
        let item = result.get(1).unwrap();
        let result = result.get(0).unwrap();
        assert_eq!(result.start(), 0);
        *src = &src[result.end()..];
        match item.as_str() {
            "+" => Ok(Token::PLUS),
            ">" => Ok(Token::GREATER),
            "," => Ok(Token::COMMA),
            "~" => Ok(Token::TILDE),
            x => panic!("unexpected regex match group value: {}", x),
        }
    } else if let Some(not) = NOT.find(src) {
        assert_eq!(not.start(), 0);
        *src = &src[not.end()..];
        Ok(Token::NOT)
    } else if src.starts_with("@") {
        *src = &src[1..];
        let ident = match_ident(*src);
        if ident.is_none() {
            return Err("expected ident after '@'".to_string());
        }
        let ident = ident.unwrap();
        *src = &src[ident.len()..];
        Ok(Token::ATKEYWORD(ident))
    } else if let Some(s) = S.find(src) {
        assert_eq!(s.start(), 0);
        *src = &src[s.end()..];
        Ok(Token::S)
    } else if src.starts_with("<!--") {
        *src = &src[4..];
        Ok(Token::CDO)
    } else if src.starts_with("-->") {
        *src = &src[3..];
        Ok(Token::CDC)
    } else if src.starts_with(":") {
        *src = &src[1..];
        Ok(Token::COLON)
    } else if src.starts_with(".") {
        *src = &src[1..];
        let ident = match_ident(*src);
        if ident.is_none() {
            return Err("expected ident after '.'".to_string());
        }
        let ident = ident.unwrap();
        *src = &src[ident.len()..];
        Ok(Token::DOT(ident))
    } else if src.starts_with("*") {
        *src = &src[1..];
        Ok(Token::STAR)
    } else if src.starts_with("|") {
        *src = &src[1..];
        Ok(Token::PIPE)
    } else if src.starts_with("-") {
        *src = &src[1..];
        Ok(Token::SUB)
    } else if src.starts_with("[") {
        *src = &src[1..];
        Ok(Token::LBRACK)
    } else if src.starts_with("]") {
        *src = &src[1..];
        Ok(Token::RBRACK)
    } else if src.starts_with("=") {
        *src = &src[1..];
        Ok(Token::EQ)
    } else if src.starts_with(")") {
        *src = &src[1..];
        Ok(Token::RPAREN)
    } else if let Some(comments) = COMMENTS.find(src) {
        assert_eq!(comments.start(), 0);
        let span = &src[0..comments.end()];
        *src = &src[comments.end()..];
        Ok(Token::COMMENTS(span))
    } else {
        // Skip the offending byte so the caller can build an error span for it.
        if src.len() > 0 {
            *src = &src[1..];
        }
        Err("no valid token matching".to_string())
    }
}

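/// Wrap `match_token_raw`, attaching a `Span` covering exactly the bytes it
/// consumed (or skipped, on error) starting at byte offset `index`.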
fn match_token<'a>(index: usize, src: &mut &'a str) -> Result<TokenSpan<'a>, LexerError<'a>> {
    let mut new_src = *src;
    let result = match_token_raw(&mut new_src);
    let span = Span::<'a> {
        start: index,
        stop: index + (src.len() - new_src.len()),
        value: &src[0..(src.len() - new_src.len())],
    };
    match result {
        Ok(token) => {
            *src = new_src;
            Ok((span, token))
        }
        Err(message) => Err(LexerError::<'a> { span, message }),
    }
}

impl<'a> Lexer<'a> {
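    /// Lex `src` into a sequence of `(Span, Token)` pairs, stopping at the
    /// first position where no token can be matched.
    ///
    /// A minimal usage sketch (the selector string is only illustrative):
    ///
    /// ```ignore
    /// let lexer = Lexer::parse("div.item > #id").unwrap();
    /// assert!(!lexer.tokens.is_empty());
    /// ```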
    pub fn parse(src: &'a str) -> Result<Lexer<'a>, LexerError<'a>> {
        let mut index: usize = 0;
        let mut new_src = src;
        let mut tokens: Vec<TokenSpan> = vec![];
        while new_src.len() > 0 {
            match match_token(index, &mut new_src) {
                Ok((span, token)) => {
                    index += span.stop - span.start;
                    tokens.push((span, token));
                }
                Err(e) => {
                    return Err(e);
                }
            }
        }
        Ok(Lexer::<'a> { src, tokens })
    }
}

#[cfg(test)]
mod test {
    use super::*;
    use crate::PASS_SELECTORS;

    #[test]
    fn pass_tests() {
        for test in PASS_SELECTORS.iter() {
            if let Err(e) = Lexer::parse(test) {
                panic!("failed to lex {}: {:?}", test, e);
            }
        }
    }
}