// s_expr/tokenizer.rs

1use super::data::*;
2use super::loc::{Position, Span, Spanned};
3use super::utf8::{next_char, MovementInBytes, NextCharError};
4
5#[cfg(feature = "unicode")]
6use unicode_xid::UnicodeXID;
7
/// Config for the tokenizer, for flags
#[derive(Debug, Clone)]
pub struct TokenizerConfig {
    /// Filter comment tokens out of the tokenizer output. Default is set to false (comments are kept)
    filter_comment: bool,
    /// Add support for the bytes token, which is of the format `#<hexadecimal>#`. Default is set to true
    support_bytes: bool,
    /// Add support for the { } group, Default is set to true
    support_brace: bool,
    /// Add support for the [ ] group, Default is set to true
    support_bracket: bool,
}
20
21impl Default for TokenizerConfig {
22    fn default() -> Self {
23        TokenizerConfig {
24            filter_comment: false,
25            support_bytes: true,
26            support_bracket: true,
27            support_brace: true,
28        }
29    }
30}
31
32impl TokenizerConfig {
33    /// Support comment in the output of the tokenizer or filter them away
34    pub fn comment(mut self, enabled: bool) -> Self {
35        self.filter_comment = !enabled;
36        self
37    }
38
39    /// Support braces group in the output of the tokenizer or filter them away
40    pub fn braces(mut self, enabled: bool) -> Self {
41        self.support_brace = enabled;
42        self
43    }
44
45    /// Support bracket group in the output of the tokenizer or filter them away
46    pub fn bracket(mut self, enabled: bool) -> Self {
47        self.support_bracket = enabled;
48        self
49    }
50
51    /// Support the bytes atom in the output of the tokenizer
52    pub fn support_bytes(mut self, supported: bool) -> Self {
53        self.support_bytes = supported;
54        self
55    }
56}
57
/// Tokenizer state on the data
pub struct Tokenizer<'a> {
    /// Raw input bytes (always coming from a valid `&str`)
    data: &'a [u8],
    /// Current byte offset into `data`
    index: TokDataPos,
    /// Current human-readable position, kept in sync with `index`
    position: Position,
    /// Behaviour flags for this tokenizer
    cfg: TokenizerConfig,
}
65
/// Byte offset into the tokenizer's input data
#[derive(Clone, Copy)]
pub struct TokDataPos(usize);
68
/// Tokens
#[derive(Clone, Debug)]
pub enum Token<'a> {
    /// Left (opening) group delimiter: `(`, `[` or `{`
    Left(GroupKind),
    /// Right (closing) group delimiter: `)`, `]` or `}`
    Right(GroupKind),
    /// Comment starting with ';'; the slice includes the leading `;` and
    /// stops before the terminating newline
    Comment(&'a str),
    /// Atom
    Atom(Atom<'a>),
}
81
82impl<'a> Token<'a> {
83    pub fn is_comment(&self) -> bool {
84        match self {
85            Token::Comment(_) => true,
86            _ => false,
87        }
88    }
89}
90
91/// A Token with the span (start and end positions) associated
92pub type SpannedToken<'a> = Spanned<Token<'a>>;
93/*
94#[derive(Clone, Debug)]
95pub struct SpannedToken<'a> {
96    pub span: Span,
97    pub token: Token<'a>,
98}
99*/
100
/// Errors that the tokenizer can return
#[derive(Clone, Debug)]
pub enum TokenError {
    /// Failed to decode the next char from the data (presumably invalid UTF-8
    /// — see `next_char`), along with the byte offset where it happened
    DataError(NextCharError, usize),
    /// End of stream reached inside a `"`-delimited string; the position is
    /// where the end was hit
    UnterminatedString(Position),
    /// End of stream reached inside a `#`-delimited bytes atom; the position
    /// is where the end was hit
    UnterminatedBytes(Position),
    /// A leading character that doesn't start any kind of token
    UnprocessedChar(char),
    /// A character that is neither a hex digit nor the closing `#` was found
    /// inside a `#`-delimited bytes atom
    UnterminatedBytesChar(Position, char),
}
109
impl<'a> Tokenizer<'a> {
    /// Create a new tokenizer from the data stream
    pub fn new(data: &'a str) -> Self {
        Tokenizer {
            data: data.as_bytes(),
            index: TokDataPos(0),
            position: Position::default(),
            cfg: TokenizerConfig::default(),
        }
    }

    /// Create a new tokenizer from the data stream with an associated config
    pub fn new_with_config(data: &'a str, cfg: TokenizerConfig) -> Self {
        Tokenizer {
            data: data.as_bytes(),
            index: TokDataPos(0),
            position: Position::default(),
            cfg,
        }
    }

    /// Return the next token, or none if reach the end of stream
    pub fn next(&mut self) -> Result<Option<SpannedToken<'a>>, TokenError> {
        // note that the tokenizer only takes the `str` type, so the content is always
        // valid utf8; short of an internal error, the .expect below should never
        // trigger except on an internal bug.
        loop {
            self.skip_whitespace().expect("Valid string");
            match self.peek_char().expect("Valid string") {
                None => return Ok(None),
                Some((leading_char, advance)) => {
                    // remember both the human position and the byte offset of the
                    // token start before consuming the leading character, so the
                    // token span/slice can include it
                    let token_start = self.position;
                    let position_start = self.index;
                    self.position.advance(leading_char);
                    self.move_index(advance);
                    let tok = self.next_cont(token_start, position_start, leading_char)?;
                    // if it's a comment, and we filter comment, we don't return
                    if !tok.inner.is_comment() {
                        return Ok(Some(tok));
                    } else {
                        if !self.cfg.filter_comment {
                            return Ok(Some(tok));
                        }
                    }
                }
            }
        }
    }

    /// Advance the byte index by the given amount of bytes
    fn move_index(&mut self, bytes: MovementInBytes) {
        self.index.0 += bytes.0
    }

    /// Return the input slice between `start` and the current byte index as a str
    fn slice_from(&self, start: TokDataPos) -> &'a str {
        let slice = &self.data[start.0..self.index.0];
        // both offsets come from `next_char` movements, so they should always
        // fall on char boundaries of the (valid utf8) input
        core::str::from_utf8(slice).expect("valid utf8")
    }

    /// Peek at the next char (and its byte width) without consuming it;
    /// `Ok(None)` signals the end of stream
    fn peek_char(&self) -> Result<Option<(char, MovementInBytes)>, TokenError> {
        match next_char(self.data, self.index.0) {
            Err(e) => Err(TokenError::DataError(e, self.index.0)),
            Ok(ok) => Ok(ok),
        }
    }

    /// Skip over whitespace characters: newline, tab and space
    fn skip_whitespace(&mut self) -> Result<(), TokenError> {
        loop {
            match self.peek_char()? {
                None => return Ok(()),
                Some((ch, advance)) => {
                    if !"\n\t ".contains(ch) {
                        return Ok(());
                    }
                    self.position.advance(ch);
                    self.move_index(advance);
                }
            }
        }
    }

    /// advance the data stream until the function F return true
    /// (the char that made F return true is not consumed)
    fn skip_until<F>(&mut self, f: F) -> Result<(), TokenError>
    where
        F: Fn(char) -> bool,
    {
        loop {
            match self.peek_char()? {
                None => return Ok(()),
                Some((ch, advance)) => {
                    if f(ch) {
                        return Ok(());
                    }
                    self.position.advance(ch);
                    self.move_index(advance);
                }
            }
        }
    }

    /// advance the data stream while the function F return true
    /// (the char that made F return false is not consumed)
    fn skip_while<F>(&mut self, f: F) -> Result<(), TokenError>
    where
        F: Fn(char) -> bool,
    {
        loop {
            match self.peek_char()? {
                None => return Ok(()),
                Some((ch, advance)) => {
                    if !f(ch) {
                        return Ok(());
                    }
                    self.position.advance(ch);
                    self.move_index(advance);
                }
            }
        }
    }

    /// Lex the tail of a `#<hexadecimal>#` bytes atom; the opening '#' was
    /// already consumed by the caller. Returns the hex digits between the
    /// delimiters (delimiters excluded).
    fn bytes(&mut self) -> Result<ABytes<'a>, TokenError> {
        let position_start = self.index;
        self.skip_while(|c| c.is_ascii_hexdigit())?;
        match self.peek_char()? {
            None => Err(TokenError::UnterminatedBytes(self.position)),
            Some((ch, advance)) => {
                if ch == '#' {
                    let dat = self.slice_from(position_start);

                    // consume the closing '#'
                    self.position.advance(ch);
                    self.move_index(advance);

                    return Ok(ABytes(dat));
                } else {
                    // stopped on something that is neither a hex digit nor '#'
                    return Err(TokenError::UnterminatedBytesChar(self.position, ch));
                }
            }
        }
    }

    /// Lex a number atom. `leading_char` is the first digit and was already
    /// consumed; `position_start` is its byte offset, so the decimal slices
    /// include it. Handles the `0b` (binary) and `0x` (hexadecimal) prefixes,
    /// whose slices exclude the prefix.
    fn number(
        &mut self,
        leading_char: char,
        position_start: TokDataPos,
    ) -> Result<ANum<'a>, TokenError> {
        match self.peek_char()? {
            None => {
                // if we reach the end of stream, just take the current buffer and raise the event
                let dat = self.slice_from(position_start);
                Ok(ANum {
                    base: ANumBase::Decimal,
                    dat: dat,
                })
            }
            Some((ch, advance)) => {
                let zero_start = leading_char == '0';

                if zero_start {
                    if ch == 'b' {
                        // binary string, eat the 'b', and save the initial position
                        self.position.advance(ch);
                        self.move_index(advance);

                        let position_start = self.index;

                        self.skip_while(|c| c == '0' || c == '1' || c == '_')?;
                        Ok(ANum {
                            base: ANumBase::Binary,
                            dat: self.slice_from(position_start),
                        })
                    } else if ch == 'x' {
                        // hexadecimal string, eat the 'x', and save the initial position
                        self.position.advance(ch);
                        self.move_index(advance);

                        let position_start = self.index;

                        self.skip_while(|c| c.is_ascii_hexdigit() || c == '_')?;
                        Ok(ANum {
                            base: ANumBase::Hexadecimal,
                            dat: self.slice_from(position_start),
                        })
                    } else if ch.is_ascii_digit() {
                        // '0' followed by more digits: plain decimal number
                        self.position.advance(ch);
                        self.move_index(advance);

                        self.skip_while(|c| c.is_numeric() || c == '_')?;
                        Ok(ANum {
                            base: ANumBase::Decimal,
                            dat: self.slice_from(position_start),
                        })
                    } else {
                        // lone '0' followed by a non-number character
                        let dat = self.slice_from(position_start);
                        Ok(ANum {
                            base: ANumBase::Decimal,
                            dat: dat,
                        })
                    }
                } else {
                    if ch.is_ascii_digit() {
                        self.position.advance(ch);
                        self.move_index(advance);

                        self.skip_while(|c| c.is_numeric() || c == '_')?;
                        Ok(ANum {
                            base: ANumBase::Decimal,
                            dat: self.slice_from(position_start),
                        })
                    } else {
                        // single-digit number
                        let dat = self.slice_from(position_start);
                        Ok(ANum {
                            base: ANumBase::Decimal,
                            dat: dat,
                        })
                    }
                }
            }
        }
    }

    // consume the data of a string atom; the opening '"' was already consumed
    // by the caller. The returned raw data keeps escape sequences verbatim,
    // `has_escape` signals whether any were seen.
    fn string(&mut self) -> Result<AStr<'a>, TokenError> {
        let mut has_escape = false; // check if there's any escape in the data
        let position_start = self.index;

        let mut escape = false;
        loop {
            match self.peek_char()? {
                None => return Err(TokenError::UnterminatedString(self.position)),
                Some((ch, advance)) => {
                    if escape {
                        // the char right after a '\' is taken verbatim, so an
                        // escaped '"' does not terminate the string
                        escape = false;
                    } else {
                        if ch == '\\' {
                            has_escape = true;
                            escape = true;
                        } else if ch == '"' {
                            let dat = self.slice_from(position_start);

                            // consume the "
                            self.position.advance(ch);
                            self.move_index(advance);

                            return Ok(AStr {
                                has_escape,
                                raw_data: dat,
                            });
                        }
                    }
                    self.position.advance(ch);
                    self.move_index(advance);
                }
            }
        }
    }

    // this method has to parse a token (or return an error); `leading_char`
    // was already consumed by `next`, `position_start` points at it
    fn next_cont(
        &mut self,
        token_start: Position,
        position_start: TokDataPos,
        leading_char: char,
    ) -> Result<SpannedToken<'a>, TokenError> {
        // helper building a spanned token ending at the given position
        let stok = |cur, token| {
            let span = Span {
                start: token_start,
                end: cur,
            };
            Ok(Spanned { span, inner: token })
        };

        // lex in this order:
        // * group characters: '(' ')' '[' ']' '{' '}'
        // * line comment: ';'
        // * string : '"'
        // * (optionally) bytes : '#'
        // * number : '0'..'9'
        // * identifier : anything else

        if leading_char == '(' {
            stok(self.position, Token::Left(GroupKind::Paren))
        } else if leading_char == ')' {
            stok(self.position, Token::Right(GroupKind::Paren))
        } else if self.cfg.support_bracket && leading_char == '[' {
            stok(self.position, Token::Left(GroupKind::Bracket))
        } else if self.cfg.support_bracket && leading_char == ']' {
            stok(self.position, Token::Right(GroupKind::Bracket))
        } else if self.cfg.support_brace && leading_char == '{' {
            stok(self.position, Token::Left(GroupKind::Brace))
        } else if self.cfg.support_brace && leading_char == '}' {
            stok(self.position, Token::Right(GroupKind::Brace))
        } else if leading_char == ';' {
            // comment: everything up to (and excluding) the next newline;
            // the slice includes the leading ';'
            self.skip_until(|c| c == '\n')?;
            let comment = self.slice_from(position_start);
            stok(self.position, Token::Comment(comment))
        } else if leading_char == '"' {
            // string
            let astr = self.string()?;
            stok(self.position, Token::Atom(Atom::String(astr)))
        } else if self.cfg.support_bytes && leading_char == '#' {
            // byte stream
            let bstr = self.bytes()?;
            stok(self.position, Token::Atom(Atom::Bytes(bstr)))
        } else if leading_char.is_ascii_digit() {
            // number
            let anum = self.number(leading_char, position_start)?;
            let is_decimal = anum.base == ANumBase::Decimal;
            // if this is a decimal number, then we check if it's followed by a '.', in this case it's a decimal type
            if is_decimal {
                match self.peek_char() {
                    Ok(Some((ch @ '.', dot_advance))) => {
                        self.position.advance(ch);
                        self.move_index(dot_advance);

                        // might parse no decimal part, but we accept it `1.` will be equivalent to `1.0`
                        let fractional_start = self.index;
                        self.skip_while(|c| c.is_ascii_digit())?;
                        let raw_fractional = self.slice_from(fractional_start);

                        let adec = ADecimal {
                            raw_integral: anum.dat,
                            raw_fractional,
                        };
                        stok(self.position, Token::Atom(Atom::Decimal(adec)))
                    }
                    _ => stok(self.position, Token::Atom(Atom::Integral(anum))),
                }
            } else {
                stok(self.position, Token::Atom(Atom::Integral(anum)))
            }
        } else if is_id_start(leading_char) {
            self.skip_while(|c| is_id_continue(c))?;
            let ident = self.slice_from(position_start);
            stok(self.position, Token::Atom(Atom::Ident(ident)))
        } else {
            Err(TokenError::UnprocessedChar(leading_char))
        }
    }
}
449
450fn is_id_start(ch: char) -> bool {
451    #[cfg(feature = "unicode")]
452    {
453        ch.is_xid_start()
454            || ch == '_'
455            || is_ascii_operator(ch)
456            || crate::utf8::extended_math_operator(ch)
457    }
458    #[cfg(not(feature = "unicode"))]
459    {
460        ch.is_ascii_alphabetic() || ch == '_' || is_ascii_operator(ch)
461    }
462}
463
464fn is_id_continue(ch: char) -> bool {
465    #[cfg(feature = "unicode")]
466    {
467        ch.is_xid_continue()
468            || ch == '_'
469            || ch.is_ascii_digit()
470            || is_ascii_operator(ch)
471            || crate::utf8::extended_math_operator(ch)
472    }
473    #[cfg(not(feature = "unicode"))]
474    {
475        ch.is_ascii_alphabetic() || ch == '_' || ch.is_ascii_digit() || is_ascii_operator(ch)
476    }
477}
478
/// Whether `ch` is one of the ascii operator characters usable in identifiers.
/// Any ascii operator except: [] {} () " ; \
fn is_ascii_operator(ch: char) -> bool {
    matches!(
        ch,
        '?' | '!'
            | '#'
            | '@'
            | '$'
            | '+'
            | '-'
            | '*'
            | '/'
            | '='
            | '<'
            | '>'
            | ','
            | '.'
            | ':'
            | '|'
            | '%'
            | '^'
            | '&'
            | '~'
            | '\''
            | '`'
    )
}