css3_selector/
token.rs

1use regex::Regex;
2use std::error::Error;
3use std::fmt;
4
/// Lexical tokens of a CSS3 selector.
///
/// The names follow the token list of the Selectors Level 3 lexical scanner:
/// https://www.w3.org/TR/2018/REC-selectors-3-20181106/#lex
///
/// Payload slices borrow from the source string that was lexed.
#[derive(Debug, Clone)]
pub enum Token<'a> {
    /// A run of whitespace (space, tab, CR, LF, FF).
    S,
    /// The `~=` attribute operator.
    INCLUDES,
    /// The `|=` attribute operator.
    DASHMATCH,
    /// The `^=` attribute operator.
    PREFIXMATCH,
    /// The `$=` attribute operator.
    SUFFIXMATCH,
    /// The `*=` attribute operator.
    SUBSTRINGMATCH,
    /// An identifier; the payload is the identifier text as written.
    IDENT(&'a str),
    /// A quoted string; the payload is the text between the quotes.
    STRING(&'a str),
    /// An identifier immediately followed by `(`; the payload excludes the `(`.
    FUNCTION(&'a str),
    /// A number; the payload is the numeric text as written.
    NUM(&'a str),
    /// `#` followed by an identifier; the payload is the identifier without `#`.
    HASH(&'a str),
    /// `+`, including any whitespace directly before it.
    PLUS,
    /// `>`, including any whitespace directly before it.
    GREATER,
    /// `,`, including any whitespace directly before it.
    COMMA,
    /// `~`, including any whitespace directly before it.
    TILDE,
    /// The opening of a negation, `:not(` (escaped spellings included).
    NOT,
    /// `@` followed by an identifier; the payload is the identifier without `@`.
    ATKEYWORD(&'a str),
    /// Part of the specification's token list; the lexer in this file never produces it.
    INVALID(&'a str),
    /// A number followed by `%`; the payload is the numeric text without `%`.
    PERCENTAGE(&'a str),
    /// A number followed by an identifier: `(number text, unit identifier)`.
    DIMENSION(&'a str, &'a str),
    /// The `<!--` marker.
    CDO,
    /// The `-->` marker.
    CDC,
    /// A `/* ... */` comment; the payload includes the delimiters.
    COMMENTS(&'a str),
    /// A single `:`.
    COLON, // non-standard to aid in parsing follows
    /// `.` followed by an identifier; the payload is the identifier without `.`.
    DOT(&'a str),
    /// A single `*`.
    STAR,
    /// A single `|`.
    PIPE,
    /// A single `-` that is not the start of an identifier or of `-->`.
    SUB,
    /// A single `[`.
    LBRACK,
    /// A single `]`.
    RBRACK,
    /// A single `=`.
    EQ,
    /// A single `)`.
    RPAREN,
}
41
/// Location of a piece of lexed text within the source string.
#[derive(Debug, Clone)]
pub struct Span<'a> {
    /// Byte offset in the source at which the text begins.
    pub start: usize,
    /// Byte offset in the source just past the end of the text.
    pub stop: usize,
    /// The slice of source text that was consumed.
    pub value: &'a str,
}
48
/// A token paired with the span of source text it was lexed from.
pub type TokenSpan<'a> = (Span<'a>, Token<'a>);
50
/// Result of lexing a selector: the original input and the tokens found in it.
pub struct Lexer<'a> {
    /// The source string that was lexed.
    pub src: &'a str,
    /// Every token of `src` in source order, each with its span.
    pub tokens: Vec<TokenSpan<'a>>,
}
55
/// Error reported when the source cannot be split into tokens.
#[derive(Debug, Clone)]
pub struct LexerError<'a> {
    /// The text consumed by the failed match attempt, and where it sits in the source.
    pub span: Span<'a>,
    /// Human-readable description of what went wrong.
    pub message: String,
}
61
62impl<'a> fmt::Display for LexerError<'a> {
63    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
64        write!(
65            f,
66            "failed to lex @ {}:{} ('{}'): {}",
67            self.span.start, self.span.stop, self.span.value, self.message
68        )
69    }
70}
71
// `LexerError` relies on the default `Error` methods; its text comes from `Display`/`Debug`.
impl<'a> Error for LexerError<'a> {}
73
74lazy_static! {
75    static ref S: Regex = Regex::new(r"(?i)^[ \t\r\n\f]+").unwrap();
76    static ref IDENT: Regex = Regex::new(r"(?i)^[-]?(?:[_a-z]|[^\x00-\x7F]|\\[0-9a-f]{1,6}(\r\n|[ \n\r\t\f])?|\\[^\n\r\f0-9a-f])(?:[_a-z0-9-]|[^\x00-\x7F]|\\[0-9a-f]{1,6}(\r\n|[ \n\r\t\f])?|\\[^\n\r\f0-9a-f])*").unwrap();
77    static ref STRING: Regex = Regex::new(r#"(?i)^(?:"((?:[^\n\r\f\\"]|\\\n|\r\n|\r|\f|[^\x00-\x7F]|\\[0-9a-f]{1,6}(?:\r\n|[ \n\r\t\f])?|\\[^\n\r\f0-9a-f])*)")|(?:'((?:[^\n\r\f\\"]|\\\n|\r\n|\r|\f|[^\x00-\x7F]|\\[0-9a-f]{1,6}(\r\n|[ \n\r\t\f])?|\\[^\n\r\f0-9a-f])*)')"#).unwrap();
78    static ref NUMBER: Regex = Regex::new(r"(?i)^(?:[0-9]+|[0-9]*\.[0-9]+)").unwrap();
79    static ref PLUS_GREATER_COMMA_TILDE: Regex = Regex::new(r"(?i)^[ \t\r\n\f]*(\+|>|,|~)").unwrap();
80    static ref NOT: Regex = Regex::new(r#"(?i)^:(?:n|\\0{0,4}(4e|6e)(\r\n|[ \t\r\n\f])?|\\n)(?:o|\\0{0,4}(4f|6f)(\r\n|[ \t\r\n\f])?|\\o)(?:t|\\0{0,4}(54|74)(\r\n|[ \t\r\n\f])?|\\t)\("#).unwrap();
81    static ref COMMENTS: Regex = Regex::new(r"(?i)^/\*[^*]*\*+([^/*][^*]*\*+)*/").unwrap();
82}
83
84fn match_ident<'a>(src: &'a str) -> Option<&'a str> {
85    IDENT.find(src).map(|ident| {
86        assert_eq!(ident.start(), 0);
87        &src[..ident.end()]
88    })
89}
90
91fn match_token_raw<'a>(src: &mut &'a str) -> Result<Token<'a>, String> {
92    if src.starts_with("~=") {
93        *src = &src[2..];
94        Ok(Token::INCLUDES)
95    } else if src.starts_with("|=") {
96        *src = &src[2..];
97        Ok(Token::DASHMATCH)
98    } else if src.starts_with("^=") {
99        *src = &src[2..];
100        Ok(Token::PREFIXMATCH)
101    } else if src.starts_with("^=") {
102        *src = &src[2..];
103        Ok(Token::PREFIXMATCH)
104    } else if src.starts_with("$=") {
105        *src = &src[2..];
106        Ok(Token::SUFFIXMATCH)
107    } else if src.starts_with("*=") {
108        *src = &src[2..];
109        Ok(Token::SUBSTRINGMATCH)
110    } else if let Some(ident) = IDENT.find(src) {
111        assert_eq!(ident.start(), 0);
112        let span = &src[0..ident.end()];
113        *src = &src[ident.end()..];
114        if src.starts_with("(") {
115            *src = &src[1..];
116            Ok(Token::FUNCTION(span))
117        } else {
118            Ok(Token::IDENT(span))
119        }
120    } else if let Some(string) = STRING.captures(src) {
121        let internal = string.get(1).unwrap();
122        let string = string.get(0).unwrap();
123        assert_eq!(string.start(), 0);
124        let span = &src[internal.start()..internal.end()];
125        *src = &src[string.end()..];
126        Ok(Token::STRING(span))
127    } else if let Some(num) = NUMBER.find(src) {
128        assert_eq!(num.start(), 0);
129        let span = &src[0..num.end()];
130        *src = &src[num.end()..];
131        if src.starts_with("%") {
132            *src = &src[1..];
133            Ok(Token::PERCENTAGE(span))
134        } else if let Some(ident) = IDENT.find(src).map(|ident| {
135            assert_eq!(ident.start(), 0);
136            let span = &src[..ident.end()];
137            *src = &src[ident.end()..];
138            span
139        }) {
140            Ok(Token::DIMENSION(span, ident))
141        } else {
142            Ok(Token::NUM(span))
143        }
144    } else if src.starts_with("#") {
145        *src = &src[1..];
146        let ident = match_ident(*src);
147        if ident.is_none() {
148            return Err("expected ident after '#'".to_string());
149        }
150        let ident = ident.unwrap();
151        *src = &src[ident.len()..];
152        Ok(Token::HASH(ident))
153    } else if let Some(result) = PLUS_GREATER_COMMA_TILDE.captures(src) {
154        let item = result.get(1).unwrap();
155        let result = result.get(0).unwrap();
156        assert_eq!(result.start(), 0);
157        *src = &src[result.end()..];
158        match item.as_str() {
159            "+" => Ok(Token::PLUS),
160            ">" => Ok(Token::GREATER),
161            "," => Ok(Token::COMMA),
162            "~" => Ok(Token::TILDE),
163            x => panic!("unexpected regex match group value: {}", x),
164        }
165    } else if let Some(not) = NOT.find(src) {
166        assert_eq!(not.start(), 0);
167        *src = &src[not.end()..];
168        Ok(Token::NOT)
169    } else if src.starts_with("@") {
170        *src = &src[1..];
171        let ident = match_ident(*src);
172        if ident.is_none() {
173            return Err("expected ident after '@'".to_string());
174        }
175        let ident = ident.unwrap();
176        *src = &src[ident.len()..];
177        Ok(Token::ATKEYWORD(ident))
178    } else if let Some(s) = S.find(src) {
179        assert_eq!(s.start(), 0);
180        *src = &src[s.end()..];
181        Ok(Token::S)
182    } else if src.starts_with("<!--") {
183        *src = &src[4..];
184        Ok(Token::CDO)
185    } else if src.starts_with("-->") {
186        *src = &src[3..];
187        Ok(Token::CDC)
188    } else if src.starts_with(":") {
189        *src = &src[1..];
190        Ok(Token::COLON)
191    } else if src.starts_with(".") {
192        *src = &src[1..];
193        let ident = match_ident(*src);
194        if ident.is_none() {
195            return Err("expected ident after '.'".to_string());
196        }
197        let ident = ident.unwrap();
198        *src = &src[ident.len()..];
199        Ok(Token::DOT(ident))
200    } else if src.starts_with("*") {
201        *src = &src[1..];
202        Ok(Token::STAR)
203    } else if src.starts_with("|") {
204        *src = &src[1..];
205        Ok(Token::PIPE)
206    } else if src.starts_with("-") {
207        *src = &src[1..];
208        Ok(Token::SUB)
209    } else if src.starts_with("[") {
210        *src = &src[1..];
211        Ok(Token::LBRACK)
212    } else if src.starts_with("]") {
213        *src = &src[1..];
214        Ok(Token::RBRACK)
215    } else if src.starts_with("=") {
216        *src = &src[1..];
217        Ok(Token::EQ)
218    } else if src.starts_with(")") {
219        *src = &src[1..];
220        Ok(Token::RPAREN)
221    } else if let Some(comments) = COMMENTS.find(src) {
222        assert_eq!(comments.start(), 0);
223        let span = &src[0..comments.end()];
224        *src = &src[comments.end()..];
225        Ok(Token::COMMENTS(span))
226    } else if let Some(s) = S.find(src) {
227        assert_eq!(s.start(), 0);
228        *src = &src[s.end()..];
229        Ok(Token::S)
230    } else {
231        if src.len() > 0 {
232            *src = &src[1..];
233        }
234        Err("no valid token matching".to_string())
235    }
236}
237
238fn match_token<'a>(index: usize, src: &mut &'a str) -> Result<TokenSpan<'a>, LexerError<'a>> {
239    let mut new_src = *src;
240    let result = match_token_raw(&mut new_src);
241    let span = Span::<'a> {
242        start: index,
243        stop: index + (src.len() - new_src.len()),
244        value: &src[0..(src.len() - new_src.len())],
245    };
246    match result {
247        Ok(token) => {
248            *src = new_src;
249            Ok((span, token))
250        }
251        Err(message) => Err(LexerError::<'a> { span, message }),
252    }
253}
254
255impl<'a> Lexer<'a> {
256    pub fn parse(src: &'a str) -> Result<Lexer<'a>, LexerError<'a>> {
257        let mut index: usize = 0;
258        let mut new_src = src;
259        let mut tokens: Vec<TokenSpan> = vec![];
260        while new_src.len() > 0 {
261            match match_token(index, &mut new_src) {
262                Ok((span, token)) => {
263                    index += span.stop - span.start;
264                    tokens.push((span, token));
265                }
266                Err(e) => {
267                    return Err(e);
268                }
269            }
270        }
271        Ok(Lexer::<'a> { src, tokens })
272    }
273}
274
#[cfg(test)]
mod test {
    use super::*;
    use crate::PASS_SELECTORS;

    /// Every entry of `PASS_SELECTORS` must lex without error.
    #[test]
    fn pass_tests() {
        for selector in PASS_SELECTORS.iter() {
            match Lexer::parse(selector) {
                Ok(_) => {}
                Err(e) => panic!("failed to lex {}: {:?}", selector, e),
            }
        }
    }
}