decent_toml_rs_alternative/
tokens.rs

1
// File taken from Alex Crichton’s toml-rs library
// https://github.com/alexcrichton/toml-rs/
// Copyright (c) 2014 Alex Crichton
5
6use std::borrow::Cow;
7use std::char;
8use std::str;
9use std::string;
10use std::string::String as StdString;
11
12use self::Token::*;
13
/// Byte range of a token within the tokenizer's input.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Span {
    /// Byte offset at which the token begins.
    pub start: usize,
    /// Byte offset one past the token's last byte (exclusive bound).
    pub end: usize,
}
22
23impl From<Span> for (usize, usize) {
24    fn from(Span { start, end }: Span) -> (usize, usize) {
25        (start, end)
26    }
27}
28
/// A single lexical element produced by the tokenizer.
///
/// Variants that carry a `&'a str` borrow directly from the input text.
#[derive(Debug, PartialEq, Eq)]
pub enum Token<'a> {
    /// A run of spaces and/or tabs.
    Whitespace(&'a str),
    /// A line ending (`\n`, or `\r\n` folded into one newline).
    Newline,
    /// A comment, including its leading `#`.
    Comment(&'a str),

    /// `=`
    Equals,
    /// `.`
    Period,
    /// `,`
    Comma,
    /// `:`
    Colon,
    /// `+`
    Plus,
    /// `{`
    LeftBrace,
    /// `}`
    RightBrace,
    /// `[`
    LeftBracket,
    /// `]`
    RightBracket,

    /// A run of ASCII letters, digits, `-` and `_`.
    Keylike(&'a str),
    /// A quoted string, either literal (`'...'`) or basic (`"..."`).
    String {
        /// The raw source text of the string, delimiters included.
        src: &'a str,
        /// The string's value; owned only when it could not be borrowed
        /// straight from the input (e.g. after processing an escape).
        val: Cow<'a, str>,
        /// Whether the string used triple delimiters.
        multiline: bool,
    },
}
52
/// Failures reported by the tokenizer.
///
/// Every `usize` payload is a byte offset into the input.
#[derive(Debug, PartialEq, Eq)]
pub enum Error {
    /// A character that is not allowed inside a string, and where it was found.
    InvalidCharInString(usize, char),
    /// The character following a backslash does not form a valid escape.
    InvalidEscape(usize, char),
    /// A non-hexadecimal character inside a `\u`/`\U` escape.
    InvalidHexEscape(usize, char),
    /// A `\u`/`\U` escape whose numeric value is not a valid `char`.
    InvalidEscapeValue(usize, u32),
    /// A newline inside a single-line string.
    NewlineInString(usize),
    /// A character that cannot begin any token.
    Unexpected(usize, char),
    /// The input ended inside a string; the offset is where the string began.
    UnterminatedString(usize),
    /// A string used as a table key contains a newline.
    NewlineInTableKey(usize),
    /// A multiline string was used as a table key.
    MultilineStringKey(usize),
    /// An empty string was used as a table key.
    EmptyTableKey(usize),
    /// A different token than the one required was found.
    Wanted {
        /// Offset at which the required token was looked for.
        at: usize,
        /// Description of what was required.
        expected: &'static str,
        /// Description of what was found instead.
        found: &'static str,
    },
}
71
72#[derive(Clone)]
73pub struct Tokenizer<'a> {
74    input: &'a str,
75    chars: CrlfFold<'a>,
76}
77
/// Character cursor that presents a `\r\n` pair as a single `\n`.
#[derive(Clone)]
struct CrlfFold<'a> {
    /// The underlying `(byte offset, char)` iterator over the input.
    chars: str::CharIndices<'a>,
}
82
/// Accumulator for a string's value while it is being scanned.
#[derive(Debug)]
enum MaybeString {
    /// The value can still be borrowed from the input; holds the byte offset
    /// at which the value starts.
    NotEscaped(usize),
    /// The value had to be copied into its own buffer.
    Owned(string::String),
}
88
89impl<'a> Tokenizer<'a> {
90    pub fn new(input: &'a str) -> Tokenizer<'a> {
91        let mut t = Tokenizer {
92            input,
93            chars: CrlfFold {
94                chars: input.char_indices(),
95            },
96        };
97        // Eat utf-8 BOM
98        t.eatc('\u{feff}');
99        t
100    }
101
102    pub fn next(&mut self) -> Result<Option<(Span, Token<'a>)>, Error> {
103        let (start, token) = match self.one() {
104            Some((start, '\n')) => (start, Newline),
105            Some((start, ' ')) => (start, self.whitespace_token(start)),
106            Some((start, '\t')) => (start, self.whitespace_token(start)),
107            Some((start, '#')) => (start, self.comment_token(start)),
108            Some((start, '=')) => (start, Equals),
109            Some((start, '.')) => (start, Period),
110            Some((start, ',')) => (start, Comma),
111            Some((start, ':')) => (start, Colon),
112            Some((start, '+')) => (start, Plus),
113            Some((start, '{')) => (start, LeftBrace),
114            Some((start, '}')) => (start, RightBrace),
115            Some((start, '[')) => (start, LeftBracket),
116            Some((start, ']')) => (start, RightBracket),
117            Some((start, '\'')) => {
118                return self
119                    .literal_string(start)
120                    .map(|t| Some((self.step_span(start), t)))
121            }
122            Some((start, '"')) => {
123                return self
124                    .basic_string(start)
125                    .map(|t| Some((self.step_span(start), t)))
126            }
127            Some((start, ch)) if is_keylike(ch) => (start, self.keylike(start)),
128
129            Some((start, ch)) => return Err(Error::Unexpected(start, ch)),
130            None => return Ok(None),
131        };
132
133        let span = self.step_span(start);
134        Ok(Some((span, token)))
135    }
136
137    pub fn peek(&mut self) -> Result<Option<(Span, Token<'a>)>, Error> {
138        self.clone().next()
139    }
140
141    pub fn eat(&mut self, expected: Token<'a>) -> Result<bool, Error> {
142        self.eat_spanned(expected).map(|s| s.is_some())
143    }
144
145    /// Eat a value, returning it's span if it was consumed.
146    pub fn eat_spanned(&mut self, expected: Token<'a>) -> Result<Option<Span>, Error> {
147        let span = match self.peek()? {
148            Some((span, ref found)) if expected == *found => span,
149            Some(_) => return Ok(None),
150            None => return Ok(None),
151        };
152
153        drop(self.next());
154        Ok(Some(span))
155    }
156
157    pub fn expect(&mut self, expected: Token<'a>) -> Result<(), Error> {
158        // ignore span
159        let _ = self.expect_spanned(expected)?;
160        Ok(())
161    }
162
163    /// Expect the given token returning its span.
164    pub fn expect_spanned(&mut self, expected: Token<'a>) -> Result<Span, Error> {
165        let current = self.current();
166        match self.next()? {
167            Some((span, found)) => {
168                if expected == found {
169                    Ok(span)
170                } else {
171                    Err(Error::Wanted {
172                        at: current,
173                        expected: expected.describe(),
174                        found: found.describe(),
175                    })
176                }
177            }
178            None => Err(Error::Wanted {
179                at: self.input.len(),
180                expected: expected.describe(),
181                found: "eof",
182            }),
183        }
184    }
185
186    pub fn table_key(&mut self) -> Result<(Span, Cow<'a, str>), Error> {
187        let current = self.current();
188        match self.next()? {
189            Some((span, Token::Keylike(k))) => Ok((span, k.into())),
190            Some((
191                span,
192                Token::String {
193                    src,
194                    val,
195                    multiline,
196                },
197            )) => {
198                let offset = self.substr_offset(src);
199                if multiline {
200                    return Err(Error::MultilineStringKey(offset));
201                }
202                if val == "" {
203                    return Err(Error::EmptyTableKey(offset));
204                }
205                match src.find('\n') {
206                    None => Ok((span, val)),
207                    Some(i) => Err(Error::NewlineInTableKey(offset + i)),
208                }
209            }
210            Some((_, other)) => Err(Error::Wanted {
211                at: current,
212                expected: "a table key",
213                found: other.describe(),
214            }),
215            None => Err(Error::Wanted {
216                at: self.input.len(),
217                expected: "a table key",
218                found: "eof",
219            }),
220        }
221    }
222
223    pub fn eat_whitespace(&mut self) -> Result<(), Error> {
224        while self.eatc(' ') || self.eatc('\t') {
225            // ...
226        }
227        Ok(())
228    }
229
230    pub fn eat_comment(&mut self) -> Result<bool, Error> {
231        if !self.eatc('#') {
232            return Ok(false);
233        }
234        drop(self.comment_token(0));
235        self.eat_newline_or_eof().map(|()| true)
236    }
237
238    pub fn eat_newline_or_eof(&mut self) -> Result<(), Error> {
239        let current = self.current();
240        match self.next()? {
241            None | Some((_, Token::Newline)) => Ok(()),
242            Some((_, other)) => Err(Error::Wanted {
243                at: current,
244                expected: "newline",
245                found: other.describe(),
246            }),
247        }
248    }
249
250    pub fn skip_to_newline(&mut self) {
251        loop {
252            match self.one() {
253                Some((_, '\n')) | None => break,
254                _ => {}
255            }
256        }
257    }
258
259    fn eatc(&mut self, ch: char) -> bool {
260        match self.chars.clone().next() {
261            Some((_, ch2)) if ch == ch2 => {
262                self.one();
263                true
264            }
265            _ => false,
266        }
267    }
268
269    pub fn current(&mut self) -> usize {
270        self.chars
271            .clone()
272            .next()
273            .map(|i| i.0)
274            .unwrap_or_else(|| self.input.len())
275    }
276
277    pub fn input(&self) -> &'a str {
278        self.input
279    }
280
281    fn whitespace_token(&mut self, start: usize) -> Token<'a> {
282        while self.eatc(' ') || self.eatc('\t') {
283            // ...
284        }
285        Whitespace(&self.input[start..self.current()])
286    }
287
288    fn comment_token(&mut self, start: usize) -> Token<'a> {
289        while let Some((_, ch)) = self.chars.clone().next() {
290            if ch != '\t' && (ch < '\u{20}' || ch > '\u{10ffff}') {
291                break;
292            }
293            self.one();
294        }
295        Comment(&self.input[start..self.current()])
296    }
297
298    fn read_string(
299        &mut self,
300        delim: char,
301        start: usize,
302        new_ch: &mut dyn FnMut(
303            &mut Tokenizer<'_>,
304            &mut MaybeString,
305            bool,
306            usize,
307            char,
308        ) -> Result<(), Error>,
309    ) -> Result<Token<'a>, Error> {
310        let mut multiline = false;
311        if self.eatc(delim) {
312            if self.eatc(delim) {
313                multiline = true;
314            } else {
315                return Ok(String {
316                    src: &self.input[start..start + 2],
317                    val: Cow::Borrowed(""),
318                    multiline: false,
319                });
320            }
321        }
322        let mut val = MaybeString::NotEscaped(self.current());
323        let mut n = 0;
324        'outer: loop {
325            n += 1;
326            match self.one() {
327                Some((i, '\n')) => {
328                    if multiline {
329                        if self.input.as_bytes()[i] == b'\r' {
330                            val.to_owned(&self.input[..i]);
331                        }
332                        if n == 1 {
333                            val = MaybeString::NotEscaped(self.current());
334                        } else {
335                            val.push('\n');
336                        }
337                        continue;
338                    } else {
339                        return Err(Error::NewlineInString(i));
340                    }
341                }
342                Some((mut i, ch)) if ch == delim => {
343                    if multiline {
344                        if !self.eatc(delim) {
345                            val.push(delim);
346                            continue 'outer;
347                        }
348                        if !self.eatc(delim) {
349                            val.push(delim);
350                            val.push(delim);
351                            continue 'outer;
352                        }
353                        if self.eatc(delim) {
354                            val.push(delim);
355                            i += 1;
356                        }
357                        if self.eatc(delim) {
358                            val.push(delim);
359                            i += 1;
360                        }
361                    }
362                    return Ok(String {
363                        src: &self.input[start..self.current()],
364                        val: val.into_cow(&self.input[..i]),
365                        multiline,
366                    });
367                }
368                Some((i, c)) => new_ch(self, &mut val, multiline, i, c)?,
369                None => return Err(Error::UnterminatedString(start)),
370            }
371        }
372    }
373
374    fn literal_string(&mut self, start: usize) -> Result<Token<'a>, Error> {
375        self.read_string('\'', start, &mut |_me, val, _multi, i, ch| {
376            if ch == '\u{09}' || ('\u{20}' <= ch && ch <= '\u{10ffff}' && ch != '\u{7f}') {
377                val.push(ch);
378                Ok(())
379            } else {
380                Err(Error::InvalidCharInString(i, ch))
381            }
382        })
383    }
384
385    fn basic_string(&mut self, start: usize) -> Result<Token<'a>, Error> {
386        self.read_string('"', start, &mut |me, val, multi, i, ch| match ch {
387            '\\' => {
388                val.to_owned(&me.input[..i]);
389                match me.chars.next() {
390                    Some((_, '"')) => val.push('"'),
391                    Some((_, '\\')) => val.push('\\'),
392                    Some((_, 'b')) => val.push('\u{8}'),
393                    Some((_, 'f')) => val.push('\u{c}'),
394                    Some((_, 'n')) => val.push('\n'),
395                    Some((_, 'r')) => val.push('\r'),
396                    Some((_, 't')) => val.push('\t'),
397                    Some((i, c @ 'u')) | Some((i, c @ 'U')) => {
398                        let len = if c == 'u' { 4 } else { 8 };
399                        val.push(me.hex(start, i, len)?);
400                    }
401                    Some((i, c @ ' ')) | Some((i, c @ '\t')) | Some((i, c @ '\n')) if multi => {
402                        if c != '\n' {
403                            while let Some((_, ch)) = me.chars.clone().next() {
404                                match ch {
405                                    ' ' | '\t' => {
406                                        me.chars.next();
407                                        continue;
408                                    }
409                                    '\n' => {
410                                        me.chars.next();
411                                        break;
412                                    }
413                                    _ => return Err(Error::InvalidEscape(i, c)),
414                                }
415                            }
416                        }
417                        while let Some((_, ch)) = me.chars.clone().next() {
418                            match ch {
419                                ' ' | '\t' | '\n' => {
420                                    me.chars.next();
421                                }
422                                _ => break,
423                            }
424                        }
425                    }
426                    Some((i, c)) => return Err(Error::InvalidEscape(i, c)),
427                    None => return Err(Error::UnterminatedString(start)),
428                }
429                Ok(())
430            }
431            ch if ch == '\u{09}' || ('\u{20}' <= ch && ch <= '\u{10ffff}' && ch != '\u{7f}') => {
432                val.push(ch);
433                Ok(())
434            }
435            _ => Err(Error::InvalidCharInString(i, ch)),
436        })
437    }
438
439    fn hex(&mut self, start: usize, i: usize, len: usize) -> Result<char, Error> {
440        let mut buf = StdString::with_capacity(len);
441        for _ in 0..len {
442            match self.one() {
443                Some((_, ch)) if ch as u32 <= 0x7F && ch.is_digit(16) => buf.push(ch),
444                Some((i, ch)) => return Err(Error::InvalidHexEscape(i, ch)),
445                None => return Err(Error::UnterminatedString(start)),
446            }
447        }
448        let val = u32::from_str_radix(&buf, 16).unwrap();
449        match char::from_u32(val) {
450            Some(ch) => Ok(ch),
451            None => Err(Error::InvalidEscapeValue(i, val)),
452        }
453    }
454
455    fn keylike(&mut self, start: usize) -> Token<'a> {
456        while let Some((_, ch)) = self.peek_one() {
457            if !is_keylike(ch) {
458                break;
459            }
460            self.one();
461        }
462        Keylike(&self.input[start..self.current()])
463    }
464
465    pub fn substr_offset(&self, s: &'a str) -> usize {
466        assert!(s.len() <= self.input.len());
467        let a = self.input.as_ptr() as usize;
468        let b = s.as_ptr() as usize;
469        assert!(a <= b);
470        b - a
471    }
472
473    /// Calculate the span of a single character.
474    fn step_span(&mut self, start: usize) -> Span {
475        let end = self
476            .peek_one()
477            .map(|t| t.0)
478            .unwrap_or_else(|| self.input.len());
479        Span { start, end }
480    }
481
482    /// Peek one char without consuming it.
483    fn peek_one(&mut self) -> Option<(usize, char)> {
484        self.chars.clone().next()
485    }
486
487    /// Take one char.
488    pub fn one(&mut self) -> Option<(usize, char)> {
489        self.chars.next()
490    }
491}
492
493impl<'a> Iterator for CrlfFold<'a> {
494    type Item = (usize, char);
495
496    fn next(&mut self) -> Option<(usize, char)> {
497        self.chars.next().map(|(i, c)| {
498            if c == '\r' {
499                let mut attempt = self.chars.clone();
500                if let Some((_, '\n')) = attempt.next() {
501                    self.chars = attempt;
502                    return (i, '\n');
503                }
504            }
505            (i, c)
506        })
507    }
508}
509
510impl MaybeString {
511    fn push(&mut self, ch: char) {
512        match *self {
513            MaybeString::NotEscaped(..) => {}
514            MaybeString::Owned(ref mut s) => s.push(ch),
515        }
516    }
517
518    fn to_owned(&mut self, input: &str) {
519        match *self {
520            MaybeString::NotEscaped(start) => {
521                *self = MaybeString::Owned(input[start..].to_owned());
522            }
523            MaybeString::Owned(..) => {}
524        }
525    }
526
527    fn into_cow(self, input: &str) -> Cow<'_, str> {
528        match self {
529            MaybeString::NotEscaped(start) => Cow::Borrowed(&input[start..]),
530            MaybeString::Owned(s) => Cow::Owned(s),
531        }
532    }
533}
534
/// Returns whether `ch` may appear in a keylike token: an ASCII letter, an
/// ASCII digit, `-` or `_`.
fn is_keylike(ch: char) -> bool {
    match ch {
        'A'..='Z' | 'a'..='z' | '0'..='9' | '-' | '_' => true,
        _ => false,
    }
}
542
543impl<'a> Token<'a> {
544    pub fn describe(&self) -> &'static str {
545        match *self {
546            Token::Keylike(_) => "an identifier",
547            Token::Equals => "an equals",
548            Token::Period => "a period",
549            Token::Comment(_) => "a comment",
550            Token::Newline => "a newline",
551            Token::Whitespace(_) => "whitespace",
552            Token::Comma => "a comma",
553            Token::RightBrace => "a right brace",
554            Token::LeftBrace => "a left brace",
555            Token::RightBracket => "a right bracket",
556            Token::LeftBracket => "a left bracket",
557            Token::String { multiline, .. } => {
558                if multiline {
559                    "a multiline string"
560                } else {
561                    "a string"
562                }
563            }
564            Token::Colon => "a colon",
565            Token::Plus => "a plus",
566        }
567    }
568}
569
#[cfg(test)]
mod tests {
    use super::{Error, Token, Tokenizer};
    use std::borrow::Cow;

    /// Asserts that the first call to `next` on `input` fails with `err` and
    /// that the tokenizer reports end of input afterwards.
    fn err(input: &str, err: Error) {
        let mut tokenizer = Tokenizer::new(input);
        let failure = tokenizer.next().unwrap_err();
        assert_eq!(failure, err);
        assert!(tokenizer.next().unwrap().is_none());
    }

    /// Asserts that `input` consists of exactly one string token spanning the
    /// whole input, with the given value and multiline flag.
    fn string_ok(input: &str, val: &str, multiline: bool) {
        let mut tokenizer = Tokenizer::new(input);
        let (_, token) = tokenizer.next().unwrap().unwrap();
        assert_eq!(
            token,
            Token::String {
                src: input,
                val: Cow::Borrowed(val),
                multiline,
            }
        );
        assert!(tokenizer.next().unwrap().is_none());
    }

    #[test]
    fn literal_strings() {
        let cases: &[(&str, &str, bool)] = &[
            ("''", "", false),
            ("''''''", "", true),
            ("'''\n'''", "", true),
            ("'a'", "a", false),
            ("'\"a'", "\"a", false),
            ("''''a'''", "'a", true),
            ("'''\n'a\n'''", "'a\n", true),
            ("'''a\n'a\r\n'''", "a\n'a\n", true),
        ];
        for &(input, val, multiline) in cases {
            string_ok(input, val, multiline);
        }
    }

    #[test]
    fn basic_strings() {
        let cases: &[(&str, &str, bool)] = &[
            (r#""""#, "", false),
            (r#""""""""#, "", true),
            (r#""a""#, "a", false),
            (r#""""a""""#, "a", true),
            (r#""\t""#, "\t", false),
            (r#""\u0000""#, "\0", false),
            (r#""\U00000000""#, "\0", false),
            (r#""\U000A0000""#, "\u{A0000}", false),
            (r#""\\t""#, "\\t", false),
            ("\"\t\"", "\t", false),
            ("\"\"\"\n\t\"\"\"", "\t", true),
            ("\"\"\"\\\n\"\"\"", "", true),
            (
                "\"\"\"\\\n     \t   \t  \\\r\n  \t \n  \t \r\n\"\"\"",
                "",
                true,
            ),
            (r#""\r""#, "\r", false),
            (r#""\n""#, "\n", false),
            (r#""\b""#, "\u{8}", false),
            (r#""a\fa""#, "a\u{c}a", false),
            (r#""\"a""#, "\"a", false),
            ("\"\"\"\na\"\"\"", "a", true),
            ("\"\"\"\n\"\"\"", "", true),
            (r#""""a\"""b""""#, "a\"\"\"b", true),
        ];
        for &(input, val, multiline) in cases {
            string_ok(input, val, multiline);
        }

        let failures = vec![
            (r#""\a"#, Error::InvalidEscape(2, 'a')),
            ("\"\\\n", Error::InvalidEscape(2, '\n')),
            ("\"\\\r\n", Error::InvalidEscape(2, '\n')),
            ("\"\\", Error::UnterminatedString(0)),
            ("\"\u{0}", Error::InvalidCharInString(1, '\u{0}')),
            (r#""\U00""#, Error::InvalidHexEscape(5, '"')),
            (r#""\U00"#, Error::UnterminatedString(0)),
            (r#""\uD800"#, Error::InvalidEscapeValue(2, 0xd800)),
            (r#""\UFFFFFFFF"#, Error::InvalidEscapeValue(2, 0xffff_ffff)),
        ];
        for (input, expected) in failures {
            err(input, expected);
        }
    }

    #[test]
    fn keylike() {
        let inputs = ["foo", "0bar", "bar0", "1234", "a-b", "a_B", "-_-", "___"];
        for input in inputs.iter().cloned() {
            let mut tokenizer = Tokenizer::new(input);
            let (_, token) = tokenizer.next().unwrap().unwrap();
            assert_eq!(token, Token::Keylike(input));
            assert!(tokenizer.next().unwrap().is_none());
        }
    }

    #[test]
    fn all() {
        /// Tokenizes `input` to the end and compares every span, token and
        /// spanned source slice against `expected`.
        fn t(input: &str, expected: &[((usize, usize), Token<'_>, &str)]) {
            let mut tokenizer = Tokenizer::new(input);
            let mut actual: Vec<((usize, usize), Token<'_>, &str)> = Vec::new();
            while let Some((span, token)) = tokenizer.next().unwrap() {
                let text = &input[span.start..span.end];
                actual.push((span.into(), token, text));
            }
            // Compare element-wise first so a mismatch is reported in detail
            // before any difference in length.
            for (got, want) in actual.iter().zip(expected) {
                assert_eq!(got, want);
            }
            assert_eq!(actual.len(), expected.len());
        }

        t(
            " a ",
            &[
                ((0, 1), Token::Whitespace(" "), " "),
                ((1, 2), Token::Keylike("a"), "a"),
                ((2, 3), Token::Whitespace(" "), " "),
            ],
        );

        t(
            " a\t [[]] \t [] {} , . =\n# foo \r\n#foo \n ",
            &[
                ((0, 1), Token::Whitespace(" "), " "),
                ((1, 2), Token::Keylike("a"), "a"),
                ((2, 4), Token::Whitespace("\t "), "\t "),
                ((4, 5), Token::LeftBracket, "["),
                ((5, 6), Token::LeftBracket, "["),
                ((6, 7), Token::RightBracket, "]"),
                ((7, 8), Token::RightBracket, "]"),
                ((8, 11), Token::Whitespace(" \t "), " \t "),
                ((11, 12), Token::LeftBracket, "["),
                ((12, 13), Token::RightBracket, "]"),
                ((13, 14), Token::Whitespace(" "), " "),
                ((14, 15), Token::LeftBrace, "{"),
                ((15, 16), Token::RightBrace, "}"),
                ((16, 17), Token::Whitespace(" "), " "),
                ((17, 18), Token::Comma, ","),
                ((18, 19), Token::Whitespace(" "), " "),
                ((19, 20), Token::Period, "."),
                ((20, 21), Token::Whitespace(" "), " "),
                ((21, 22), Token::Equals, "="),
                ((22, 23), Token::Newline, "\n"),
                ((23, 29), Token::Comment("# foo "), "# foo "),
                ((29, 31), Token::Newline, "\r\n"),
                ((31, 36), Token::Comment("#foo "), "#foo "),
                ((36, 37), Token::Newline, "\n"),
                ((37, 38), Token::Whitespace(" "), " "),
            ],
        );
    }

    #[test]
    fn bare_cr_bad() {
        let failures = vec![
            ("\r", Error::Unexpected(0, '\r')),
            ("'\n", Error::NewlineInString(1)),
            ("'\u{0}", Error::InvalidCharInString(1, '\u{0}')),
            ("'", Error::UnterminatedString(0)),
            ("\u{0}", Error::Unexpected(0, '\u{0}')),
        ];
        for (input, expected) in failures {
            err(input, expected);
        }
    }

    #[test]
    fn bad_comment() {
        // The comment token ends before the NUL, which is then rejected as a
        // token of its own.
        let mut tokenizer = Tokenizer::new("#\u{0}");
        tokenizer.next().unwrap().unwrap();
        assert_eq!(tokenizer.next(), Err(Error::Unexpected(1, '\u{0}')));
        assert!(tokenizer.next().unwrap().is_none());
    }
}