toml_spanner/
tokens.rs

1#![allow(missing_docs)]
2//! The tokenizer is publicly exposed if you wish to use it instead
3
4use crate::{Span, value::Key};
5use std::{borrow::Cow, char, str};
6
/// A lexical token produced by [`Tokenizer::step`].
///
/// Variants that carry text borrow it from the tokenizer's input.
7#[derive(Eq, PartialEq, Debug)]
8pub enum Token<'a> {
    /// A run of spaces and/or tabs, as a slice of the input.
9    Whitespace(&'a str),
    /// A line break; the character cursor reports `\r\n` as a single `\n`.
10    Newline,
    /// A comment as a slice of the input, beginning at its `#` when
    /// produced by [`Tokenizer::step`].
11    Comment(&'a str),
12
    /// `=`
13    Equals,
    /// `.`
14    Period,
    /// `,`
15    Comma,
    /// `:`
16    Colon,
    /// `+`
17    Plus,
    /// `{`
18    LeftBrace,
    /// `}`
19    RightBrace,
    /// `[`
20    LeftBracket,
    /// `]`
21    RightBracket,
22
    /// A run of ASCII alphanumerics, `-` and `_`.
23    Keylike(&'a str),
    /// A quoted string (single- or double-quoted, single-line or multiline).
24    String {
        /// The raw source text of the string, including its delimiters.
25        src: &'a str,
        /// The string's value; owned only when it could not be borrowed
        /// directly from the input (escapes, folded `\r\n`).
26        val: Cow<'a, str>,
        /// `true` when the string used triple delimiters.
27        multiline: bool,
28    },
29}
30
/// Errors reported by the tokenizer.
///
/// All `usize` positions are byte offsets into the tokenizer's input unless
/// noted otherwise.
31#[derive(Eq, PartialEq, Debug)]
32pub enum Error {
    /// A character that is not permitted inside a string, with its offset.
33    InvalidCharInString(usize, char),
    /// An unsupported escape: the offset and character following the `\`.
34    InvalidEscape(usize, char),
    /// A non-hexadecimal character inside a `\u`/`\U` escape, with its offset.
35    InvalidHexEscape(usize, char),
    /// A `\u`/`\U` escape whose value is not a valid `char`: the offset of
    /// the `u`/`U`, the number of hex digits read, and the decoded value.
36    InvalidEscapeValue(usize, usize, u32),
    /// A line break inside a single-line string, with its offset.
37    NewlineInString(usize),
    /// A character that cannot start any token, with its offset.
38    Unexpected(usize, char),
    /// The input ended inside a string; carries the offset of the string's
    /// opening delimiter.
39    UnterminatedString(usize),
    /// A multiline string was used as a table key: the offset of the string
    /// source, and that offset plus the length of the decoded value.
40    MultilineStringKey(usize, usize),
    /// A specific token was required but something else was found.
41    Wanted {
        /// Offset at which the expected token should have started.
42        at: usize,
        /// Description of what was expected.
43        expected: &'static str,
        /// Description of what was found instead (`"eof"` at end of input).
44        found: &'static str,
45    },
46}
47
/// Tokenizer state: the input text plus a cursor over its characters.
///
/// The type is `Clone` so that [`Tokenizer::peek`] can look ahead on a copy
/// without consuming anything.
48#[derive(Clone)]
49pub struct Tokenizer<'a> {
    /// The complete input being tokenized; tokens borrow slices of it.
50    input: &'a str,
    /// Character cursor over `input` that reports `\r\n` as a single `\n`.
51    chars: CrlfFold<'a>,
52}
53
/// Character iterator that yields `(byte offset, char)` pairs and folds each
/// `\r\n` pair into a single `\n` reported at the offset of the `\r`.
54#[derive(Clone)]
55struct CrlfFold<'a> {
    /// The underlying, unfolded character iterator.
56    chars: str::CharIndices<'a>,
57}
58
/// The value of a string while it is being read: either still a plain slice
/// of the input, or an owned buffer once the value diverges from the source.
59#[derive(Debug)]
60enum MaybeString {
    /// No copy made yet; holds the byte offset in the input where the value
    /// starts. Pushes are no-ops in this state.
61    NotEscaped(usize),
    /// The value accumulated so far in an owned buffer.
62    Owned(String),
63}
64
65impl<'a> Tokenizer<'a> {
66    pub fn new(input: &'a str) -> Tokenizer<'a> {
67        let mut t = Tokenizer {
68            input,
69            chars: CrlfFold {
70                chars: input.char_indices(),
71            },
72        };
73        // Eat utf-8 BOM
74        t.eatc('\u{feff}');
75        t
76    }
77
78    pub fn step(&mut self) -> Result<Option<(Span, Token<'a>)>, Error> {
79        let (start, token) = match self.one() {
80            Some((start, '\n')) => (start, Token::Newline),
81            Some((start, ' ' | '\t')) => (start, self.whitespace_token(start)),
82            Some((start, '#')) => (start, self.comment_token(start)),
83            Some((start, '=')) => (start, Token::Equals),
84            Some((start, '.')) => (start, Token::Period),
85            Some((start, ',')) => (start, Token::Comma),
86            Some((start, ':')) => (start, Token::Colon),
87            Some((start, '+')) => (start, Token::Plus),
88            Some((start, '{')) => (start, Token::LeftBrace),
89            Some((start, '}')) => (start, Token::RightBrace),
90            Some((start, '[')) => (start, Token::LeftBracket),
91            Some((start, ']')) => (start, Token::RightBracket),
92            Some((start, '\'')) => return self.literal_string(start).map(|(s, t)| Some((s, t))),
93            Some((start, '"')) => return self.basic_string(start).map(|(s, t)| Some((s, t))),
94            Some((start, ch)) if is_keylike(ch) => (start, self.keylike(start)),
95            Some((start, ch)) => return Err(Error::Unexpected(start, ch)),
96            None => return Ok(None),
97        };
98
99        let span = self.step_span(start);
100        Ok(Some((span, token)))
101    }
102
103    pub fn peek(&mut self) -> Result<Option<(Span, Token<'a>)>, Error> {
104        self.clone().step()
105    }
106
107    pub fn eat(&mut self, expected: Token<'a>) -> Result<bool, Error> {
108        self.eat_spanned(expected).map(|s| s.is_some())
109    }
110
111    /// Eat a value, returning it's span if it was consumed.
112    pub fn eat_spanned(&mut self, expected: Token<'a>) -> Result<Option<Span>, Error> {
113        let span = match self.peek()? {
114            Some((span, ref found)) if expected == *found => span,
115            Some(_) | None => return Ok(None),
116        };
117
118        drop(self.step());
119        Ok(Some(span))
120    }
121
122    pub fn expect(&mut self, expected: Token<'a>) -> Result<(), Error> {
123        // ignore span
124        let _ = self.expect_spanned(expected)?;
125        Ok(())
126    }
127
128    /// Expect the given token returning its span.
129    pub fn expect_spanned(&mut self, expected: Token<'a>) -> Result<Span, Error> {
130        let current = self.current();
131        match self.step()? {
132            Some((span, found)) => {
133                if expected == found {
134                    Ok(span)
135                } else {
136                    Err(Error::Wanted {
137                        at: current,
138                        expected: expected.describe(),
139                        found: found.describe(),
140                    })
141                }
142            }
143            None => Err(Error::Wanted {
144                at: self.input.len(),
145                expected: expected.describe(),
146                found: "eof",
147            }),
148        }
149    }
150
151    pub fn table_key(&mut self) -> Result<Key<'a>, Error> {
152        let current = self.current();
153        match self.step()? {
154            Some((span, Token::Keylike(k))) => Ok(Key {
155                span,
156                name: k.into(),
157            }),
158            Some((
159                span,
160                Token::String {
161                    src,
162                    val,
163                    multiline,
164                    ..
165                },
166            )) => {
167                let offset = self.substr_offset(src);
168                if multiline {
169                    return Err(Error::MultilineStringKey(offset, offset + val.len()));
170                }
171                match src.find('\n') {
172                    None => Ok(Key { span, name: val }),
173                    // This is not reachable
174                    Some(i) => Err(Error::InvalidCharInString(i, '\n')),
175                }
176            }
177            Some((_, other)) => Err(Error::Wanted {
178                at: current,
179                expected: "a table key",
180                found: other.describe(),
181            }),
182            None => Err(Error::Wanted {
183                at: self.input.len(),
184                expected: "a table key",
185                found: "eof",
186            }),
187        }
188    }
189
190    pub fn eat_whitespace(&mut self) {
191        while self.eatc(' ') || self.eatc('\t') {
192            // ...
193        }
194    }
195
196    pub fn eat_comment(&mut self) -> Result<bool, Error> {
197        if !self.eatc('#') {
198            return Ok(false);
199        }
200        drop(self.comment_token(0));
201        self.eat_newline_or_eof().map(|()| true)
202    }
203
204    pub fn eat_newline_or_eof(&mut self) -> Result<(), Error> {
205        let current = self.current();
206        match self.step()? {
207            None | Some((_, Token::Newline)) => Ok(()),
208            Some((_, other)) => Err(Error::Wanted {
209                at: current,
210                expected: "newline",
211                found: other.describe(),
212            }),
213        }
214    }
215
216    pub fn skip_to_newline(&mut self) {
217        loop {
218            match self.one() {
219                Some((_, '\n')) | None => break,
220                _ => {}
221            }
222        }
223    }
224
225    fn eatc(&mut self, ch: char) -> bool {
226        match self.chars.clone().next() {
227            Some((_, ch2)) if ch == ch2 => {
228                self.one();
229                true
230            }
231            _ => false,
232        }
233    }
234
235    pub fn current(&mut self) -> usize {
236        match self.chars.clone().next() {
237            Some(i) => i.0,
238            None => self.input.len(),
239        }
240    }
241
242    fn whitespace_token(&mut self, start: usize) -> Token<'a> {
243        while self.eatc(' ') || self.eatc('\t') {
244            // ...
245        }
246        Token::Whitespace(&self.input[start..self.current()])
247    }
248
249    fn comment_token(&mut self, start: usize) -> Token<'a> {
250        while let Some((_, ch)) = self.chars.clone().next() {
251            if ch != '\t' && !('\u{20}'..='\u{10ffff}').contains(&ch) {
252                break;
253            }
254            self.one();
255        }
256        Token::Comment(&self.input[start..self.current()])
257    }
258
259    /// String spans are treated slightly differently, as we only want the
260    /// characters in the string, not the quotes, as once the user gets the
261    /// string and its span they won't know the actual begin/end which can
262    /// be needed for doing substring indices (eg reporting error messages
263    /// when parsing a string)
264    #[allow(clippy::type_complexity)]
265    fn read_string(
266        &mut self,
267        delim: char,
268        start: usize,
269        new_ch: &mut dyn FnMut(
270            &mut Tokenizer<'_>,
271            &mut MaybeString,
272            bool,
273            usize,
274            char,
275        ) -> Result<(), Error>,
276    ) -> Result<(Span, Token<'a>), Error> {
277        let mut multiline = false;
278        if self.eatc(delim) {
279            if self.eatc(delim) {
280                multiline = true;
281            } else {
282                return Ok((
283                    // Point the caret at the beginning of the quote, that looks
284                    // better than the end quote
285                    Span::new(start as u32, (start + 1) as u32),
286                    Token::String {
287                        src: &self.input[start..start + 2],
288                        val: Cow::Borrowed(""),
289                        multiline: false,
290                    },
291                ));
292            }
293        }
294        let mut val = MaybeString::NotEscaped(self.current());
295        let mut n = 0;
296        loop {
297            n += 1;
298            match self.one() {
299                Some((i, '\n')) => {
300                    if multiline {
301                        if self.input.as_bytes()[i] == b'\r' {
302                            val.make_owned(&self.input[..i]);
303                        }
304                        if n == 1 {
305                            val = MaybeString::NotEscaped(self.current());
306                        } else {
307                            val.push('\n');
308                        }
309                    } else {
310                        return Err(Error::NewlineInString(i));
311                    }
312                }
313                Some((mut i, ch)) if ch == delim => {
314                    let span = if multiline {
315                        if !self.eatc(delim) {
316                            val.push(delim);
317                            continue;
318                        }
319                        if !self.eatc(delim) {
320                            val.push(delim);
321                            val.push(delim);
322                            continue;
323                        }
324                        if self.eatc(delim) {
325                            val.push(delim);
326                            i += 1;
327                        }
328                        if self.eatc(delim) {
329                            val.push(delim);
330                            i += 1;
331                        }
332
333                        // Also skip the first newline after the opening delimiters
334                        let maybe_nl = self.input.as_bytes()[start + 3];
335                        let start_off = if maybe_nl == b'\n' {
336                            4
337                        } else if maybe_nl == b'\r' {
338                            5
339                        } else {
340                            3
341                        };
342
343                        Span::new((start + start_off) as u32, (self.current() - 3) as u32)
344                    } else {
345                        Span::new((start + 1) as u32, (self.current() - 1) as u32)
346                    };
347
348                    return Ok((
349                        span,
350                        Token::String {
351                            src: &self.input[start..self.current()],
352                            val: val.into_cow(&self.input[..i]),
353                            multiline,
354                        },
355                    ));
356                }
357                Some((i, c)) => new_ch(self, &mut val, multiline, i, c)?,
358                None => return Err(Error::UnterminatedString(start)),
359            }
360        }
361    }
362
363    fn literal_string(&mut self, start: usize) -> Result<(Span, Token<'a>), Error> {
364        self.read_string('\'', start, &mut |_me, val, _multi, i, ch| {
365            if ch == '\u{09}' || (ch != '\u{7f}' && ('\u{20}'..='\u{10ffff}').contains(&ch)) {
366                val.push(ch);
367                Ok(())
368            } else {
369                Err(Error::InvalidCharInString(i, ch))
370            }
371        })
372    }
373
374    fn basic_string(&mut self, start: usize) -> Result<(Span, Token<'a>), Error> {
375        self.read_string('"', start, &mut |me, val, multi, i, ch| match ch {
376            '\\' => {
377                val.make_owned(&me.input[..i]);
378                match me.chars.next() {
379                    Some((_, '"')) => val.push('"'),
380                    Some((_, '\\')) => val.push('\\'),
381                    Some((_, 'b')) => val.push('\u{8}'),
382                    Some((_, 'f')) => val.push('\u{c}'),
383                    Some((_, 'n')) => val.push('\n'),
384                    Some((_, 'r')) => val.push('\r'),
385                    Some((_, 't')) => val.push('\t'),
386                    Some((i, c @ ('u' | 'U'))) => {
387                        let c = if c == 'u' {
388                            me.hex::<4>(start, i)
389                        } else {
390                            me.hex::<8>(start, i)
391                        };
392                        val.push(c?);
393                    }
394                    Some((i, c @ (' ' | '\t' | '\n'))) if multi => {
395                        if c != '\n' {
396                            while let Some((_, ch)) = me.chars.clone().next() {
397                                match ch {
398                                    ' ' | '\t' => {
399                                        me.chars.next();
400                                    }
401                                    '\n' => {
402                                        me.chars.next();
403                                        break;
404                                    }
405                                    _ => return Err(Error::InvalidEscape(i, c)),
406                                }
407                            }
408                        }
409                        while let Some((_, ch)) = me.chars.clone().next() {
410                            match ch {
411                                ' ' | '\t' | '\n' => {
412                                    me.chars.next();
413                                }
414                                _ => break,
415                            }
416                        }
417                    }
418                    Some((i, c)) => return Err(Error::InvalidEscape(i, c)),
419                    None => return Err(Error::UnterminatedString(start)),
420                }
421                Ok(())
422            }
423            ch if ch == '\u{09}' || (ch != '\u{7f}' && ('\u{20}'..='\u{10ffff}').contains(&ch)) => {
424                val.push(ch);
425                Ok(())
426            }
427            _ => Err(Error::InvalidCharInString(i, ch)),
428        })
429    }
430
431    fn hex<const N: usize>(&mut self, start: usize, i: usize) -> Result<char, Error> {
432        let mut buf = [0; N];
433        for b in buf.iter_mut() {
434            match self.one() {
435                Some((_, ch)) if ch as u32 <= 0x7F && ch.is_ascii_hexdigit() => *b = ch as u8,
436                Some((i, ch)) => return Err(Error::InvalidHexEscape(i, ch)),
437                None => return Err(Error::UnterminatedString(start)),
438            }
439        }
440        let val = u32::from_str_radix(std::str::from_utf8(&buf).unwrap(), 16).unwrap();
441        match char::from_u32(val) {
442            Some(ch) => Ok(ch),
443            None => Err(Error::InvalidEscapeValue(i, N, val)),
444        }
445    }
446
447    fn keylike(&mut self, start: usize) -> Token<'a> {
448        while let Some((_, ch)) = self.peek_one() {
449            if !is_keylike(ch) {
450                break;
451            }
452            self.one();
453        }
454        Token::Keylike(&self.input[start..self.current()])
455    }
456
457    pub fn substr_offset(&self, s: &'a str) -> usize {
458        assert!(s.len() <= self.input.len());
459        let a = self.input.as_ptr() as usize;
460        let b = s.as_ptr() as usize;
461        assert!(a <= b);
462        b - a
463    }
464
465    /// Calculate the span of a single character.
466    fn step_span(&mut self, start: usize) -> Span {
467        let end = match self.peek_one() {
468            Some(t) => t.0,
469            None => self.input.len(),
470        };
471        Span {
472            start: start as u32,
473            end: end as u32,
474        }
475    }
476
477    /// Peek one char without consuming it.
478    fn peek_one(&mut self) -> Option<(usize, char)> {
479        self.chars.clone().next()
480    }
481
482    /// Take one char.
483    pub fn one(&mut self) -> Option<(usize, char)> {
484        self.chars.next()
485    }
486}
487
488impl Iterator for CrlfFold<'_> {
489    type Item = (usize, char);
490
491    fn next(&mut self) -> Option<(usize, char)> {
492        self.chars.next().map(|(i, c)| {
493            if c == '\r' {
494                let mut attempt = self.chars.clone();
495                if let Some((_, '\n')) = attempt.next() {
496                    self.chars = attempt;
497                    return (i, '\n');
498                }
499            }
500            (i, c)
501        })
502    }
503}
504
505impl MaybeString {
506    fn push(&mut self, ch: char) {
507        match *self {
508            MaybeString::NotEscaped(..) => {}
509            MaybeString::Owned(ref mut s) => s.push(ch),
510        }
511    }
512
513    fn make_owned(&mut self, input: &str) {
514        match *self {
515            MaybeString::NotEscaped(start) => {
516                *self = MaybeString::Owned(input[start..].to_owned());
517            }
518            MaybeString::Owned(..) => {}
519        }
520    }
521
522    fn into_cow(self, input: &str) -> Cow<'_, str> {
523        match self {
524            MaybeString::NotEscaped(start) => Cow::Borrowed(&input[start..]),
525            MaybeString::Owned(s) => Cow::Owned(s),
526        }
527    }
528}
529
/// Whether `ch` may appear in a bare key-like token: an ASCII letter or
/// digit, `-`, or `_`.
#[inline]
fn is_keylike(ch: char) -> bool {
    matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_')
}
534
535impl Token<'_> {
536    pub fn describe(&self) -> &'static str {
537        match *self {
538            Token::Keylike(_) => "an identifier",
539            Token::Equals => "an equals",
540            Token::Period => "a period",
541            Token::Comment(_) => "a comment",
542            Token::Newline => "a newline",
543            Token::Whitespace(_) => "whitespace",
544            Token::Comma => "a comma",
545            Token::RightBrace => "a right brace",
546            Token::LeftBrace => "a left brace",
547            Token::RightBracket => "a right bracket",
548            Token::LeftBracket => "a left bracket",
549            Token::String { multiline, .. } => {
550                if multiline {
551                    "a multiline string"
552                } else {
553                    "a string"
554                }
555            }
556            Token::Colon => "a colon",
557            Token::Plus => "a plus",
558        }
559    }
560}