postcss/
tokenizer.rs

1use crate::ref_ring::RefRing;
2use memchr::memchr;
3use memchr::memmem::Finder;
4use once_cell::sync::Lazy;
5use std::cell::RefCell;
6use std::clone::Clone;
7use std::cmp::PartialEq;
8use std::cmp::{min, Eq};
9
// ASCII characters the tokenizer dispatches on.
const SINGLE_QUOTE: char = '\'';
const DOUBLE_QUOTE: char = '"';
const BACKSLASH: char = '\\';
const SLASH: char = '/';
const NEWLINE: char = '\n';
const SPACE: char = ' ';
// Form feed is U+000C (decimal 12). Rust has no `\f` escape, so it is spelled in hex.
const FEED: char = '\u{c}'; // \f
const TAB: char = '\t';
const CR: char = '\r';
const OPEN_SQUARE: char = '[';
const CLOSE_SQUARE: char = ']';
const OPEN_PARENTHESES: char = '(';
const CLOSE_PARENTHESES: char = ')';
const OPEN_CURLY: char = '{';
const CLOSE_CURLY: char = '}';
const SEMICOLON: char = ';';
const ASTERISK: char = '*';
const COLON: char = ':';
const AT: char = '@';

/// Upper bound on the capacity pre-allocated for the pushed-back token stack.
const MAX_BUFFER: usize = 102400;
31
/// Substring searcher for the comment terminator `*/`, constructed on first use and
/// then shared by every call to `index_of_end_comment`.
static FINDER_END_OF_COMMENT: Lazy<Finder<'static>> = Lazy::new(|| Finder::new("*/"));
33
/// Kind of a lexical token produced by the tokenizer.
///
/// The enum carries no data, so it is `Copy` and can be passed around by value.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum TokenType {
  /// A lone `(` that could not be paired into a `Brackets` token.
  OpenParentheses,
  /// A `)` character.
  CloseParentheses,
  /// A run of whitespace characters.
  Space,
  /// Any other run of characters, including backslash escapes.
  Word,
  /// A single- or double-quoted string, quotes included.
  String,
  /// A `[` character.
  OpenSquare,
  /// A `]` character.
  CloseSquare,
  /// A `{` character.
  OpenCurly,
  /// A `}` character.
  CloseCurly,
  /// A `;` character.
  Semicolon,
  /// A `:` character.
  Colon,
  /// A `/* ... */` comment.
  Comment,
  /// A word introduced by `@`.
  AtWord,
  /// A parenthesised span taken as one token, e.g. `(...)` after `url`.
  Brackets,
  /// Fallback kind for characters `get_token_type` does not recognise.
  Unknown,
}
52
53#[derive(Debug, Clone, Eq, PartialEq)]
54pub struct Token<'a>(
55  pub TokenType,
56  pub &'a str,
57  pub Option<usize>,
58  pub Option<usize>,
59);
60
61impl<'a> Token<'a> {
62  pub fn new(kind: TokenType, content: &'a str, pos: Option<usize>, next: Option<usize>) -> Token {
63    Token(kind, content, pos, next)
64  }
65}
66
67#[derive(Debug)]
68pub struct Tokenizer<'a> {
69  css: &'a str,
70  ignore: bool,
71  length: usize,
72  pos: RefCell<usize>,
73  buffer: RefCell<RefRing<'a>>,
74  returned: RefCell<Vec<Token<'a>>>,
75  rope: Option<ropey::Rope>,
76}
77
78impl<'a> Tokenizer<'a> {
79  pub fn new(source_code: &'a str, ignore_errors: bool) -> Tokenizer<'a> {
80    let length = source_code.len();
81    Tokenizer {
82      css: source_code,
83      ignore: ignore_errors,
84      length,
85      pos: RefCell::new(0),
86      buffer: RefCell::new(Default::default()),
87      returned: RefCell::new(Vec::with_capacity(min(MAX_BUFFER, length / 8))),
88      rope: None,
89    }
90  }
91
92  #[inline]
93  fn push(&self, t: &'a str) {
94    self.buffer.borrow_mut().push(t);
95  }
96
97  #[inline]
98  pub fn position(&self) -> usize {
99    *self.pos.borrow()
100  }
101
102  pub fn unclosed(&self, what: &str) {
103    panic!("Unclosed {} {}", what, self.position());
104  }
105
106  pub fn end_of_file(&self) -> bool {
107    self.returned.borrow().is_empty() && self.position() >= self.length
108  }
109
110  pub fn back(&self, token: Token<'a>) {
111    self.returned.borrow_mut().push(token);
112  }
113
114  #[inline]
115  fn pos_plus_one(&self) {
116    self.pos.replace_with(|it| *it + 1);
117  }
118
119  pub fn next_token(&self, ignore_unclosed: bool) -> Token<'a> {
120    if !self.returned.borrow().is_empty() {
121      return self.returned.borrow_mut().pop().unwrap();
122    }
123
124    let mut code = char_code_at(self.css, self.position());
125
126    let current_token: Token;
127
128    match code {
129      NEWLINE | SPACE | TAB | CR | FEED => {
130        let mut next = self.position();
131        loop {
132          next += 1;
133          code = char_code_at(self.css, next);
134          if !(code == SPACE || code == NEWLINE || code == TAB || code == FEED) {
135            break;
136          }
137        }
138
139        current_token = Token(
140          TokenType::Space,
141          self.css[self.position()..next].into(),
142          None,
143          None,
144        );
145
146        self.pos.replace(next);
147      }
148      OPEN_SQUARE | CLOSE_SQUARE | OPEN_CURLY | CLOSE_CURLY | COLON | SEMICOLON
149      | CLOSE_PARENTHESES => {
150        current_token = Token(
151          get_token_type(code),
152          get_str(code),
153          Some(self.position()),
154          None,
155        );
156        self.pos_plus_one();
157      }
158      OPEN_PARENTHESES => {
159        let prev = self.buffer.borrow_mut().pop().unwrap_or("");
160        let n = char_code_at(self.css, self.position() + 1);
161        if prev == "url"
162          && n != SINGLE_QUOTE
163          && n != DOUBLE_QUOTE
164          && n != SPACE
165          && n != NEWLINE
166          && n != TAB
167          && n != FEED
168          && n != CR
169        {
170          let mut next = self.position();
171          loop {
172            let mut escaped = false;
173            match index_of_byte(self.css, b')', next + 1) {
174              Some(i) => {
175                next = i;
176              }
177              None => {
178                if self.ignore || ignore_unclosed {
179                  next = self.position();
180                  break;
181                } else {
182                  self.unclosed("bracket")
183                }
184              }
185            }
186
187            let mut escape_pos = next;
188            while char_code_at(self.css, escape_pos - 1) == BACKSLASH {
189              escape_pos -= 1;
190              escaped = !escaped;
191            }
192
193            if !escaped {
194              break;
195            }
196          }
197
198          current_token = Token(
199            TokenType::Brackets,
200            sub_string(self.css, self.position(), next + 1),
201            Some(self.position()),
202            Some(next),
203          );
204
205          self.pos.replace(next + 1);
206        } else {
207          match index_of_byte(self.css, b')', self.position() + 1) {
208            Some(i) => {
209              let content = &self.css[self.position()..i + 1];
210
211              if is_bad_bracket(content) {
212                current_token = Token(TokenType::OpenParentheses, "(", Some(self.position()), None);
213              } else {
214                current_token = Token(TokenType::Brackets, content, Some(self.position()), Some(i));
215                self.pos.replace(i);
216              }
217            }
218            None => {
219              current_token = Token(TokenType::OpenParentheses, "(", Some(self.position()), None);
220            }
221          };
222          self.pos_plus_one();
223        }
224      }
225      SINGLE_QUOTE | DOUBLE_QUOTE => {
226        let quote = if code == SINGLE_QUOTE { b'\'' } else { b'"' };
227        let mut next = self.position();
228        loop {
229          let mut escaped = false;
230          match index_of_byte(self.css, quote, next + 1) {
231            Some(i) => {
232              next = i;
233            }
234            None => {
235              if self.ignore || ignore_unclosed {
236                next = self.position() + 1;
237                break;
238              } else {
239                self.unclosed("string")
240              }
241            }
242          }
243
244          let mut escape_pos = next;
245          while char_code_at(self.css, escape_pos - 1) == BACKSLASH {
246            escape_pos -= 1;
247            escaped = !escaped;
248          }
249
250          if !escaped {
251            break;
252          }
253        }
254
255        current_token = Token(
256          TokenType::String,
257          sub_string(self.css, self.position(), next + 1),
258          Some(self.position()),
259          Some(next),
260        );
261        self.pos.replace(next + 1);
262      }
263      AT => {
264        let next = index_of_at_end(self.css, self.position() + 1) - 1;
265        current_token = Token(
266          TokenType::AtWord,
267          sub_string(self.css, self.position(), next + 1),
268          Some(self.position()),
269          Some(next),
270        );
271        self.pos.replace(next + 1);
272      }
273      BACKSLASH => {
274        let mut next = self.position();
275        let mut escape = true;
276        while char_code_at(self.css, next + 1) == BACKSLASH {
277          next += 1;
278          escape = !escape;
279        }
280        code = char_code_at(self.css, next + 1);
281        if escape
282          && code != SLASH
283          && code != SPACE
284          && code != NEWLINE
285          && code != TAB
286          && code != CR
287          && code != FEED
288        {
289          next += 1;
290          if is_hex_char(self.css, next) {
291            while is_hex_char(self.css, next + 1) {
292              next += 1;
293            }
294            if char_code_at(self.css, next + 1) == SPACE {
295              next += 1;
296            }
297          }
298        }
299
300        current_token = Token(
301          TokenType::Word,
302          sub_string(self.css, self.position(), next + 1),
303          Some(self.position()),
304          Some(next),
305        );
306        self.pos.replace(next + 1);
307      }
308      _ => {
309        self.pos.replace(
310          if code == SLASH && char_code_at(self.css, self.position() + 1) == ASTERISK {
311            let next = match index_of_end_comment(self.css, self.position() + 2) {
312              Some(i) => i + 1,
313              None => {
314                if !self.ignore && !ignore_unclosed {
315                  self.unclosed("comment");
316                }
317                self.length
318              }
319            };
320
321            current_token = Token(
322              TokenType::Comment,
323              sub_string(self.css, self.position(), next + 1),
324              Some(self.position()),
325              Some(next),
326            );
327            next
328          } else {
329            let next = index_of_word_end(self.css, self.position() + 1) - 1;
330            let content = sub_string(self.css, self.position(), next + 1);
331            current_token = Token::new(TokenType::Word, content, Some(self.position()), Some(next));
332            self.push(content);
333            next
334          },
335        );
336        self.pos_plus_one();
337      }
338    }
339
340    current_token
341  }
342
343  /// return (line, column), use rope for simplicity
344  pub fn from_offset(&mut self, offset: usize) -> (usize, usize) {
345    let rope = if let Some(ref rope) = self.rope {
346      rope
347    } else {
348      self.rope = Some(ropey::Rope::from_str(self.css));
349      &self.rope.as_ref().unwrap()
350    };
351    let column = rope.byte_to_char(offset);
352    let line = rope.byte_to_line(offset);
353    (line, column)
354  }
355}
356
357#[inline]
358fn index_of_end_comment(value: &str, from_index: usize) -> Option<usize> {
359  let (_, last) = value.split_at(from_index);
360  FINDER_END_OF_COMMENT
361    .find(last.as_bytes())
362    .map(|v| v + from_index)
363}
364
365#[inline]
366fn index_of_byte(value: &str, search_value: u8, from_index: usize) -> Option<usize> {
367  let (_, last) = value.split_at(from_index);
368  memchr(search_value, last.as_bytes()).map(|v| v + from_index)
369}
370
/// Returns `s[start..end]`, clamping `end` to the length of `s`.
///
/// Panics if `start` lies beyond the clamped end or either bound is not on a
/// character boundary.
#[inline]
fn sub_string(s: &str, start: usize, end: usize) -> &str {
  let stop = if end + 1 > s.len() { s.len() } else { end };
  &s[start..stop]
}
379
/// Returns the byte at offset `n` widened to a `char`, or `'\0'` when `n` is out of
/// range.
///
/// The value is a raw byte, not a decoded code point: bytes of a multi-byte UTF-8
/// sequence come back individually as chars in the range U+0080..=U+00FF.
#[inline]
fn char_code_at(s: &str, n: usize) -> char {
  s.as_bytes().get(n).map_or('\0', |&byte| byte as char)
}
388
/// Reports whether the byte at offset `n` is an ASCII hexadecimal digit
/// (`0-9`, `a-f`, `A-F`). Out-of-range offsets yield `false`.
#[inline]
fn is_hex_char(s: &str, n: usize) -> bool {
  s.as_bytes().get(n).map_or(false, u8::is_ascii_hexdigit)
}
397
/// Reports whether a parenthesised span contains a character that prevents it from
/// being taken as a single bracket token.
///
/// The first byte (the opening parenthesis) is skipped; any later newline, quote,
/// `(`, `/` or backslash makes the span "bad".
#[inline]
fn is_bad_bracket(s: &str) -> bool {
  s.as_bytes()
    .iter()
    .skip(1)
    .any(|&byte| matches!(byte, b'\n' | b'"' | b'\'' | b'(' | b'/' | b'\\'))
}
412
/// Returns the byte offset of the first character at or after `start` that ends an
/// at-word: whitespace (tab, newline, form feed, carriage return, space) or one of
/// `" # ' ( ) / ; [ \ ] { }`.
///
/// When no such character exists the result is the length of `s`, or `start` itself
/// if `start` already lies past the end.
#[inline]
fn index_of_at_end(s: &str, start: usize) -> usize {
  let bytes = s.as_bytes();
  bytes
    .iter()
    .skip(start)
    .position(|&byte| {
      matches!(
        byte,
        // b'\x0c' is form feed (U+000C).
        b'\t' | b'\n' | b'\x0c' | b'\r' | b' ' | b'"' | b'#' | b'\'' | b'(' | b')' | b'/'
          | b';' | b'[' | b'\\' | b']' | b'{' | b'}'
      )
    })
    .map_or_else(|| start.max(bytes.len()), |offset| start + offset)
}
431
/// Returns the byte offset of the first character at or after `start` that ends a
/// word: whitespace (tab, newline, form feed, carriage return, space), one of
/// `! " # ' ( ) : ; @ [ \ ] { }`, or a `/` that begins a `/*` comment.
///
/// When no such character exists the result is the length of `s`, or `start` itself
/// if `start` already lies past the end.
#[inline]
fn index_of_word_end(s: &str, start: usize) -> usize {
  let bytes = s.as_bytes();
  let len = bytes.len();
  let mut i = start;

  while i < len {
    match bytes[i] {
      // b'\x0c' is form feed (U+000C).
      b'\t' | b'\n' | b'\x0c' | b'\r' | b' ' | b'!' | b'"' | b'#' | b'\'' | b'(' | b')'
      | b':' | b';' | b'@' | b'[' | b'\\' | b']' | b'{' | b'}' => return i,
      // A slash only ends the word when it opens a comment. `get` keeps a trailing
      // `/` at the very end of the input from indexing out of bounds.
      b'/' if bytes.get(i + 1) == Some(&b'*') => return i,
      _ => i += 1,
    }
  }

  i
}
456
/// Returns the static one-character text for a single-character punctuation token.
///
/// Only the characters listed in the match are meaningful. No `unsafe` is involved:
/// any other character simply yields an empty string, so callers are expected to
/// pass one of the listed characters.
const fn get_str(ch: char) -> &'static str {
  match ch {
    OPEN_SQUARE => "[",
    CLOSE_SQUARE => "]",
    OPEN_CURLY => "{",
    CLOSE_CURLY => "}",
    COLON => ":",
    SEMICOLON => ";",
    CLOSE_PARENTHESES => ")",
    // Not a single-character token.
    _ => "",
  }
}
470
/// Maps a single-character punctuation token to its `TokenType`.
///
/// Only the characters listed in the match are meaningful. No `unsafe` is involved:
/// any other character yields `TokenType::Unknown`, so callers are expected to pass
/// one of the listed characters.
const fn get_token_type(ch: char) -> TokenType {
  match ch {
    OPEN_SQUARE => TokenType::OpenSquare,
    CLOSE_SQUARE => TokenType::CloseSquare,
    OPEN_CURLY => TokenType::OpenCurly,
    CLOSE_CURLY => TokenType::CloseCurly,
    COLON => TokenType::Colon,
    SEMICOLON => TokenType::Semicolon,
    CLOSE_PARENTHESES => TokenType::CloseParentheses,
    // Not a single-character token.
    _ => TokenType::Unknown,
  }
}
484
#[cfg(test)]
mod test {
  use super::*;

  /// In-range offsets yield the byte at that offset; out-of-range offsets yield NUL.
  #[test]
  fn test_char_code_at() {
    let s = "0123456789abc";
    let cases = [(0usize, '0'), (1, '1'), (100, '\0')];
    for &(index, expected) in cases.iter() {
      assert_eq!(char_code_at(s, index), expected);
    }
  }
}