1use crate::ref_ring::RefRing;
2use memchr::memchr;
3use memchr::memmem::Finder;
4use once_cell::sync::Lazy;
5use std::cell::RefCell;
6use std::clone::Clone;
7use std::cmp::PartialEq;
8use std::cmp::{min, Eq};
9
// Characters the tokenizer dispatches on. Every one of them is a single-byte
// ASCII character, so it can be compared against individual bytes of the source.
const SINGLE_QUOTE: char = '\'';
const DOUBLE_QUOTE: char = '"';
const BACKSLASH: char = '\\';
const SLASH: char = '/';
const NEWLINE: char = '\n';
const SPACE: char = ' ';
// Form feed (`\f`) is U+000C. `\u{..}` escapes are hexadecimal, so writing
// `'\u{12}'` here would denote U+0012 (DC2) instead of form feed.
const FEED: char = '\u{c}';
const TAB: char = '\t';
const CR: char = '\r';
const OPEN_SQUARE: char = '[';
const CLOSE_SQUARE: char = ']';
const OPEN_PARENTHESES: char = '(';
const CLOSE_PARENTHESES: char = ')';
const OPEN_CURLY: char = '{';
const CLOSE_CURLY: char = '}';
const SEMICOLON: char = ';';
const ASTERISK: char = '*';
const COLON: char = ':';
const AT: char = '@';

/// Upper bound on the capacity pre-allocated for the pushed-back token stack.
const MAX_BUFFER: usize = 102400;
31
/// Substring searcher for the comment terminator `*/`, built lazily on first
/// use and then shared by every search for the end of a comment.
static FINDER_END_OF_COMMENT: Lazy<Finder<'static>> = Lazy::new(|| Finder::new("*/"));
33
/// Kind of a token produced by the tokenizer.
#[derive(Debug, Clone, Eq, PartialEq)]
pub enum TokenType {
    /// A lone `(`: emitted when no `)` follows or the bracketed content is
    /// rejected by `is_bad_bracket`.
    OpenParentheses,
    /// A `)` character.
    CloseParentheses,
    /// A run of whitespace characters.
    Space,
    /// A word: any run of bytes up to the next delimiter, or a backslash escape.
    Word,
    /// A single- or double-quoted string, quotes included.
    String,
    /// A `[` character.
    OpenSquare,
    /// A `]` character.
    CloseSquare,
    /// A `{` character.
    OpenCurly,
    /// A `}` character.
    CloseCurly,
    /// A `;` character.
    Semicolon,
    /// A `:` character.
    Colon,
    /// A `/* ... */` comment.
    Comment,
    /// A word introduced by `@`.
    AtWord,
    /// A whole `( ... )` span taken as one token (including unquoted `url(...)`).
    Brackets,
    /// Fallback kind returned by `get_token_type` for characters it does not map.
    Unknown,
}
52
/// A token: its kind, the slice of source text it covers, and optional byte
/// offsets into the source.
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct Token<'a>(
    /// Kind of the token.
    pub TokenType,
    /// Text of the token, borrowed from the tokenized source.
    pub &'a str,
    /// Byte offset at which the token starts (`None` for whitespace tokens).
    pub Option<usize>,
    /// Byte offset of the token's last byte; `None` for whitespace and for
    /// single-character tokens.
    pub Option<usize>,
);
60
61impl<'a> Token<'a> {
62 pub fn new(kind: TokenType, content: &'a str, pos: Option<usize>, next: Option<usize>) -> Token {
63 Token(kind, content, pos, next)
64 }
65}
66
/// Tokenizer over a borrowed source string. The cursor and the buffers sit
/// behind `RefCell`s so tokens can be pulled through a shared reference.
#[derive(Debug)]
pub struct Tokenizer<'a> {
    /// Source text being tokenized.
    css: &'a str,
    /// When `true`, unclosed strings/brackets/comments do not panic.
    ignore: bool,
    /// Length of `css` in bytes.
    length: usize,
    /// Byte offset of the next character to tokenize.
    pos: RefCell<usize>,
    /// Recently produced word slices; consulted when a `(` is met to detect `url(`.
    buffer: RefCell<RefRing<'a>>,
    /// Tokens handed back by the caller, returned again before any new token.
    returned: RefCell<Vec<Token<'a>>>,
    /// Rope over `css`, built on the first `from_offset` call.
    rope: Option<ropey::Rope>,
}
77
78impl<'a> Tokenizer<'a> {
79 pub fn new(source_code: &'a str, ignore_errors: bool) -> Tokenizer<'a> {
80 let length = source_code.len();
81 Tokenizer {
82 css: source_code,
83 ignore: ignore_errors,
84 length,
85 pos: RefCell::new(0),
86 buffer: RefCell::new(Default::default()),
87 returned: RefCell::new(Vec::with_capacity(min(MAX_BUFFER, length / 8))),
88 rope: None,
89 }
90 }
91
92 #[inline]
93 fn push(&self, t: &'a str) {
94 self.buffer.borrow_mut().push(t);
95 }
96
97 #[inline]
98 pub fn position(&self) -> usize {
99 *self.pos.borrow()
100 }
101
102 pub fn unclosed(&self, what: &str) {
103 panic!("Unclosed {} {}", what, self.position());
104 }
105
106 pub fn end_of_file(&self) -> bool {
107 self.returned.borrow().is_empty() && self.position() >= self.length
108 }
109
110 pub fn back(&self, token: Token<'a>) {
111 self.returned.borrow_mut().push(token);
112 }
113
114 #[inline]
115 fn pos_plus_one(&self) {
116 self.pos.replace_with(|it| *it + 1);
117 }
118
119 pub fn next_token(&self, ignore_unclosed: bool) -> Token<'a> {
120 if !self.returned.borrow().is_empty() {
121 return self.returned.borrow_mut().pop().unwrap();
122 }
123
124 let mut code = char_code_at(self.css, self.position());
125
126 let current_token: Token;
127
128 match code {
129 NEWLINE | SPACE | TAB | CR | FEED => {
130 let mut next = self.position();
131 loop {
132 next += 1;
133 code = char_code_at(self.css, next);
134 if !(code == SPACE || code == NEWLINE || code == TAB || code == FEED) {
135 break;
136 }
137 }
138
139 current_token = Token(
140 TokenType::Space,
141 self.css[self.position()..next].into(),
142 None,
143 None,
144 );
145
146 self.pos.replace(next);
147 }
148 OPEN_SQUARE | CLOSE_SQUARE | OPEN_CURLY | CLOSE_CURLY | COLON | SEMICOLON
149 | CLOSE_PARENTHESES => {
150 current_token = Token(
151 get_token_type(code),
152 get_str(code),
153 Some(self.position()),
154 None,
155 );
156 self.pos_plus_one();
157 }
158 OPEN_PARENTHESES => {
159 let prev = self.buffer.borrow_mut().pop().unwrap_or("");
160 let n = char_code_at(self.css, self.position() + 1);
161 if prev == "url"
162 && n != SINGLE_QUOTE
163 && n != DOUBLE_QUOTE
164 && n != SPACE
165 && n != NEWLINE
166 && n != TAB
167 && n != FEED
168 && n != CR
169 {
170 let mut next = self.position();
171 loop {
172 let mut escaped = false;
173 match index_of_byte(self.css, b')', next + 1) {
174 Some(i) => {
175 next = i;
176 }
177 None => {
178 if self.ignore || ignore_unclosed {
179 next = self.position();
180 break;
181 } else {
182 self.unclosed("bracket")
183 }
184 }
185 }
186
187 let mut escape_pos = next;
188 while char_code_at(self.css, escape_pos - 1) == BACKSLASH {
189 escape_pos -= 1;
190 escaped = !escaped;
191 }
192
193 if !escaped {
194 break;
195 }
196 }
197
198 current_token = Token(
199 TokenType::Brackets,
200 sub_string(self.css, self.position(), next + 1),
201 Some(self.position()),
202 Some(next),
203 );
204
205 self.pos.replace(next + 1);
206 } else {
207 match index_of_byte(self.css, b')', self.position() + 1) {
208 Some(i) => {
209 let content = &self.css[self.position()..i + 1];
210
211 if is_bad_bracket(content) {
212 current_token = Token(TokenType::OpenParentheses, "(", Some(self.position()), None);
213 } else {
214 current_token = Token(TokenType::Brackets, content, Some(self.position()), Some(i));
215 self.pos.replace(i);
216 }
217 }
218 None => {
219 current_token = Token(TokenType::OpenParentheses, "(", Some(self.position()), None);
220 }
221 };
222 self.pos_plus_one();
223 }
224 }
225 SINGLE_QUOTE | DOUBLE_QUOTE => {
226 let quote = if code == SINGLE_QUOTE { b'\'' } else { b'"' };
227 let mut next = self.position();
228 loop {
229 let mut escaped = false;
230 match index_of_byte(self.css, quote, next + 1) {
231 Some(i) => {
232 next = i;
233 }
234 None => {
235 if self.ignore || ignore_unclosed {
236 next = self.position() + 1;
237 break;
238 } else {
239 self.unclosed("string")
240 }
241 }
242 }
243
244 let mut escape_pos = next;
245 while char_code_at(self.css, escape_pos - 1) == BACKSLASH {
246 escape_pos -= 1;
247 escaped = !escaped;
248 }
249
250 if !escaped {
251 break;
252 }
253 }
254
255 current_token = Token(
256 TokenType::String,
257 sub_string(self.css, self.position(), next + 1),
258 Some(self.position()),
259 Some(next),
260 );
261 self.pos.replace(next + 1);
262 }
263 AT => {
264 let next = index_of_at_end(self.css, self.position() + 1) - 1;
265 current_token = Token(
266 TokenType::AtWord,
267 sub_string(self.css, self.position(), next + 1),
268 Some(self.position()),
269 Some(next),
270 );
271 self.pos.replace(next + 1);
272 }
273 BACKSLASH => {
274 let mut next = self.position();
275 let mut escape = true;
276 while char_code_at(self.css, next + 1) == BACKSLASH {
277 next += 1;
278 escape = !escape;
279 }
280 code = char_code_at(self.css, next + 1);
281 if escape
282 && code != SLASH
283 && code != SPACE
284 && code != NEWLINE
285 && code != TAB
286 && code != CR
287 && code != FEED
288 {
289 next += 1;
290 if is_hex_char(self.css, next) {
291 while is_hex_char(self.css, next + 1) {
292 next += 1;
293 }
294 if char_code_at(self.css, next + 1) == SPACE {
295 next += 1;
296 }
297 }
298 }
299
300 current_token = Token(
301 TokenType::Word,
302 sub_string(self.css, self.position(), next + 1),
303 Some(self.position()),
304 Some(next),
305 );
306 self.pos.replace(next + 1);
307 }
308 _ => {
309 self.pos.replace(
310 if code == SLASH && char_code_at(self.css, self.position() + 1) == ASTERISK {
311 let next = match index_of_end_comment(self.css, self.position() + 2) {
312 Some(i) => i + 1,
313 None => {
314 if !self.ignore && !ignore_unclosed {
315 self.unclosed("comment");
316 }
317 self.length
318 }
319 };
320
321 current_token = Token(
322 TokenType::Comment,
323 sub_string(self.css, self.position(), next + 1),
324 Some(self.position()),
325 Some(next),
326 );
327 next
328 } else {
329 let next = index_of_word_end(self.css, self.position() + 1) - 1;
330 let content = sub_string(self.css, self.position(), next + 1);
331 current_token = Token::new(TokenType::Word, content, Some(self.position()), Some(next));
332 self.push(content);
333 next
334 },
335 );
336 self.pos_plus_one();
337 }
338 }
339
340 current_token
341 }
342
343 pub fn from_offset(&mut self, offset: usize) -> (usize, usize) {
345 let rope = if let Some(ref rope) = self.rope {
346 rope
347 } else {
348 self.rope = Some(ropey::Rope::from_str(self.css));
349 &self.rope.as_ref().unwrap()
350 };
351 let column = rope.byte_to_char(offset);
352 let line = rope.byte_to_line(offset);
353 (line, column)
354 }
355}
356
357#[inline]
358fn index_of_end_comment(value: &str, from_index: usize) -> Option<usize> {
359 let (_, last) = value.split_at(from_index);
360 FINDER_END_OF_COMMENT
361 .find(last.as_bytes())
362 .map(|v| v + from_index)
363}
364
365#[inline]
366fn index_of_byte(value: &str, search_value: u8, from_index: usize) -> Option<usize> {
367 let (_, last) = value.split_at(from_index);
368 memchr(search_value, last.as_bytes()).map(|v| v + from_index)
369}
370
/// Returns `s[start..end]`, with `end` clamped to the length of `s`.
#[inline]
fn sub_string(s: &str, start: usize, end: usize) -> &str {
    let stop = end.min(s.len());
    &s[start..stop]
}
379
/// Returns the byte at index `n` of `s` widened to a `char`, or `'\0'` when
/// `n` is out of range.
#[inline]
fn char_code_at(s: &str, n: usize) -> char {
    match s.as_bytes().get(n) {
        Some(&byte) => char::from(byte),
        None => '\0',
    }
}
388
/// Returns `true` when the byte at index `n` of `s` is an ASCII hex digit
/// (`0-9`, `a-f`, `A-F`); `false` when `n` is out of range.
#[inline]
fn is_hex_char(s: &str, n: usize) -> bool {
    s.as_bytes().get(n).map_or(false, u8::is_ascii_hexdigit)
}
397
/// Returns `true` when any byte of `s` after the first is a newline, a quote,
/// `(`, `/` or a backslash.
#[inline]
fn is_bad_bracket(s: &str) -> bool {
    s.bytes()
        .skip(1)
        .any(|byte| matches!(byte, b'\n' | b'"' | b'\'' | b'(' | b'/' | b'\\'))
}
412
/// Returns the index of the first byte at or after `start` that ends an
/// at-word: whitespace (tab, newline, form feed, carriage return, space) or
/// one of `" # ' ( ) / ; [ \ ] { }`.
///
/// Returns the length of `s` when no such byte follows, and `start` itself
/// when `start` is past the end of `s`.
#[inline]
fn index_of_at_end(s: &str, start: usize) -> usize {
    let bytes = s.as_bytes();
    let rest = match bytes.get(start..) {
        Some(rest) => rest,
        None => return start,
    };

    rest.iter()
        .position(|&byte| {
            matches!(
                byte,
                // `\x0C` is form feed; `'\u{12}'` would be U+0012 (DC2).
                b'\t' | b'\n'
                    | b'\x0C'
                    | b'\r'
                    | b' '
                    | b'"'
                    | b'#'
                    | b'\''
                    | b'('
                    | b')'
                    | b'/'
                    | b';'
                    | b'['
                    | b'\\'
                    | b']'
                    | b'{'
                    | b'}'
            )
        })
        .map_or(bytes.len(), |offset| start + offset)
}
431
/// Returns the index of the first byte at or after `start` that ends a word:
/// whitespace (tab, newline, form feed, carriage return, space), one of
/// `! " # ' ( ) : ; @ [ \ ] { }`, or a `/` that opens a `/*` comment.
///
/// Returns the length of `s` when no such byte follows, and `start` itself
/// when `start` is past the end of `s`.
#[inline]
fn index_of_word_end(s: &str, start: usize) -> usize {
    let bytes = s.as_bytes();
    let len = bytes.len();
    let mut i = start;

    while i < len {
        match bytes[i] {
            // `\x0C` is form feed; `'\u{12}'` would be U+0012 (DC2).
            b'\t' | b'\n' | b'\x0C' | b'\r' | b' ' | b'!' | b'"' | b'#' | b'\'' | b'(' | b')'
            | b':' | b';' | b'@' | b'[' | b'\\' | b']' | b'{' | b'}' => return i,
            // A slash ends the word only when it opens a comment. `get` keeps
            // a slash in the very last position from indexing past the end.
            b'/' if bytes.get(i + 1) == Some(&b'*') => return i,
            _ => i += 1,
        }
    }

    i
}
456
/// Returns the one-character string for a punctuation character handled as a
/// single-character token, or `""` for any other character.
const fn get_str(ch: char) -> &'static str {
    match ch {
        COLON => ":",
        SEMICOLON => ";",
        OPEN_CURLY => "{",
        CLOSE_CURLY => "}",
        OPEN_SQUARE => "[",
        CLOSE_SQUARE => "]",
        CLOSE_PARENTHESES => ")",
        _ => "",
    }
}
470
471const fn get_token_type(ch: char) -> TokenType {
473 match ch {
474 OPEN_SQUARE => TokenType::OpenSquare,
475 CLOSE_SQUARE => TokenType::CloseSquare,
476 OPEN_CURLY => TokenType::OpenCurly,
477 CLOSE_CURLY => TokenType::CloseCurly,
478 COLON => TokenType::Colon,
479 SEMICOLON => TokenType::Semicolon,
480 CLOSE_PARENTHESES => TokenType::CloseParentheses,
481 _ => TokenType::Unknown,
482 }
483}
484
#[cfg(test)]
mod test {
    use super::*;

    /// In-range indices yield the character at that index; an out-of-range
    /// index yields NUL.
    #[test]
    fn test_char_code_at() {
        let s = "0123456789abc";
        let expectations = [(0, '0'), (1, '1'), (100, '\0')];
        for &(index, expected) in expectations.iter() {
            assert_eq!(char_code_at(s, index), expected);
        }
    }
}