1
2use std::borrow::Cow;
7use std::char;
8use std::str;
9use std::string;
10use std::string::String as StdString;
11
12use self::Token::*;
13
/// A half-open range of byte offsets locating a token in the tokenizer input.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Span {
    /// Byte offset of the first character of the token.
    pub start: usize,
    /// Byte offset just past the last character of the token.
    pub end: usize,
}
22
23impl From<Span> for (usize, usize) {
24 fn from(Span { start, end }: Span) -> (usize, usize) {
25 (start, end)
26 }
27}
28
/// A single lexical token produced by the tokenizer.
///
/// Variants that carry a `&'a str` borrow directly from the tokenizer input.
#[derive(Debug, PartialEq, Eq)]
pub enum Token<'a> {
    /// A run of spaces and/or tabs.
    Whitespace(&'a str),
    /// A line break.
    Newline,
    /// A comment, starting at its `#` and excluding the line break.
    Comment(&'a str),

    /// `=`
    Equals,
    /// `.`
    Period,
    /// `,`
    Comma,
    /// `:`
    Colon,
    /// `+`
    Plus,
    /// `{`
    LeftBrace,
    /// `}`
    RightBrace,
    /// `[`
    LeftBracket,
    /// `]`
    RightBracket,

    /// A run of ASCII letters, digits, `-` and `_`.
    Keylike(&'a str),
    /// A quoted string (basic or literal).
    String {
        /// The raw source text of the string, delimiters included.
        src: &'a str,
        /// The string contents; owned when they differ from the source text
        /// (escapes, folded line endings), borrowed otherwise.
        val: Cow<'a, str>,
        /// Whether the string used triple-quote delimiters.
        multiline: bool,
    },
}
52
/// Errors reported while tokenizing.
///
/// Every `usize` payload is a byte offset into the tokenizer input.
#[derive(Debug, PartialEq, Eq)]
pub enum Error {
    /// A character that is not allowed inside a string, and where it was found.
    InvalidCharInString(usize, char),
    /// A backslash followed by a character that does not form a known escape.
    InvalidEscape(usize, char),
    /// A non-hexadecimal character inside a `\u` / `\U` escape.
    InvalidHexEscape(usize, char),
    /// A `\u` / `\U` escape whose numeric value is not a valid `char`.
    InvalidEscapeValue(usize, u32),
    /// A line break inside a single-line string.
    NewlineInString(usize),
    /// A character that cannot start any token.
    Unexpected(usize, char),
    /// End of input was reached inside a string; the offset is where the
    /// string started.
    UnterminatedString(usize),
    /// A line break inside a quoted table key.
    NewlineInTableKey(usize),
    /// A multiline string was used as a table key.
    MultilineStringKey(usize),
    /// An empty string was used as a table key.
    EmptyTableKey(usize),
    /// A specific token was required but something else was found.
    Wanted {
        /// Where the unexpected token was encountered.
        at: usize,
        /// Description of what was required.
        expected: &'static str,
        /// Description of what was actually present.
        found: &'static str,
    },
}
71
72#[derive(Clone)]
73pub struct Tokenizer<'a> {
74 input: &'a str,
75 chars: CrlfFold<'a>,
76}
77
/// Character cursor whose `Iterator` implementation reports a `\r\n` pair as
/// a single `'\n'`.
#[derive(Clone)]
struct CrlfFold<'a> {
    /// The underlying `(byte offset, char)` iterator over the input.
    chars: str::CharIndices<'a>,
}
82
/// The value of a string while it is being read.
///
/// It starts out as a position into the input and only becomes an owned
/// buffer once the value can no longer be borrowed verbatim.
#[derive(Debug)]
enum MaybeString {
    /// Nothing has required a copy yet; the payload is the byte offset in the
    /// input where the value begins.
    NotEscaped(usize),
    /// The value has been copied out and is being built up character by
    /// character.
    Owned(string::String),
}
88
89impl<'a> Tokenizer<'a> {
90 pub fn new(input: &'a str) -> Tokenizer<'a> {
91 let mut t = Tokenizer {
92 input,
93 chars: CrlfFold {
94 chars: input.char_indices(),
95 },
96 };
97 t.eatc('\u{feff}');
99 t
100 }
101
102 pub fn next(&mut self) -> Result<Option<(Span, Token<'a>)>, Error> {
103 let (start, token) = match self.one() {
104 Some((start, '\n')) => (start, Newline),
105 Some((start, ' ')) => (start, self.whitespace_token(start)),
106 Some((start, '\t')) => (start, self.whitespace_token(start)),
107 Some((start, '#')) => (start, self.comment_token(start)),
108 Some((start, '=')) => (start, Equals),
109 Some((start, '.')) => (start, Period),
110 Some((start, ',')) => (start, Comma),
111 Some((start, ':')) => (start, Colon),
112 Some((start, '+')) => (start, Plus),
113 Some((start, '{')) => (start, LeftBrace),
114 Some((start, '}')) => (start, RightBrace),
115 Some((start, '[')) => (start, LeftBracket),
116 Some((start, ']')) => (start, RightBracket),
117 Some((start, '\'')) => {
118 return self
119 .literal_string(start)
120 .map(|t| Some((self.step_span(start), t)))
121 }
122 Some((start, '"')) => {
123 return self
124 .basic_string(start)
125 .map(|t| Some((self.step_span(start), t)))
126 }
127 Some((start, ch)) if is_keylike(ch) => (start, self.keylike(start)),
128
129 Some((start, ch)) => return Err(Error::Unexpected(start, ch)),
130 None => return Ok(None),
131 };
132
133 let span = self.step_span(start);
134 Ok(Some((span, token)))
135 }
136
137 pub fn peek(&mut self) -> Result<Option<(Span, Token<'a>)>, Error> {
138 self.clone().next()
139 }
140
141 pub fn eat(&mut self, expected: Token<'a>) -> Result<bool, Error> {
142 self.eat_spanned(expected).map(|s| s.is_some())
143 }
144
145 pub fn eat_spanned(&mut self, expected: Token<'a>) -> Result<Option<Span>, Error> {
147 let span = match self.peek()? {
148 Some((span, ref found)) if expected == *found => span,
149 Some(_) => return Ok(None),
150 None => return Ok(None),
151 };
152
153 drop(self.next());
154 Ok(Some(span))
155 }
156
157 pub fn expect(&mut self, expected: Token<'a>) -> Result<(), Error> {
158 let _ = self.expect_spanned(expected)?;
160 Ok(())
161 }
162
163 pub fn expect_spanned(&mut self, expected: Token<'a>) -> Result<Span, Error> {
165 let current = self.current();
166 match self.next()? {
167 Some((span, found)) => {
168 if expected == found {
169 Ok(span)
170 } else {
171 Err(Error::Wanted {
172 at: current,
173 expected: expected.describe(),
174 found: found.describe(),
175 })
176 }
177 }
178 None => Err(Error::Wanted {
179 at: self.input.len(),
180 expected: expected.describe(),
181 found: "eof",
182 }),
183 }
184 }
185
186 pub fn table_key(&mut self) -> Result<(Span, Cow<'a, str>), Error> {
187 let current = self.current();
188 match self.next()? {
189 Some((span, Token::Keylike(k))) => Ok((span, k.into())),
190 Some((
191 span,
192 Token::String {
193 src,
194 val,
195 multiline,
196 },
197 )) => {
198 let offset = self.substr_offset(src);
199 if multiline {
200 return Err(Error::MultilineStringKey(offset));
201 }
202 if val == "" {
203 return Err(Error::EmptyTableKey(offset));
204 }
205 match src.find('\n') {
206 None => Ok((span, val)),
207 Some(i) => Err(Error::NewlineInTableKey(offset + i)),
208 }
209 }
210 Some((_, other)) => Err(Error::Wanted {
211 at: current,
212 expected: "a table key",
213 found: other.describe(),
214 }),
215 None => Err(Error::Wanted {
216 at: self.input.len(),
217 expected: "a table key",
218 found: "eof",
219 }),
220 }
221 }
222
223 pub fn eat_whitespace(&mut self) -> Result<(), Error> {
224 while self.eatc(' ') || self.eatc('\t') {
225 }
227 Ok(())
228 }
229
230 pub fn eat_comment(&mut self) -> Result<bool, Error> {
231 if !self.eatc('#') {
232 return Ok(false);
233 }
234 drop(self.comment_token(0));
235 self.eat_newline_or_eof().map(|()| true)
236 }
237
238 pub fn eat_newline_or_eof(&mut self) -> Result<(), Error> {
239 let current = self.current();
240 match self.next()? {
241 None | Some((_, Token::Newline)) => Ok(()),
242 Some((_, other)) => Err(Error::Wanted {
243 at: current,
244 expected: "newline",
245 found: other.describe(),
246 }),
247 }
248 }
249
250 pub fn skip_to_newline(&mut self) {
251 loop {
252 match self.one() {
253 Some((_, '\n')) | None => break,
254 _ => {}
255 }
256 }
257 }
258
259 fn eatc(&mut self, ch: char) -> bool {
260 match self.chars.clone().next() {
261 Some((_, ch2)) if ch == ch2 => {
262 self.one();
263 true
264 }
265 _ => false,
266 }
267 }
268
269 pub fn current(&mut self) -> usize {
270 self.chars
271 .clone()
272 .next()
273 .map(|i| i.0)
274 .unwrap_or_else(|| self.input.len())
275 }
276
277 pub fn input(&self) -> &'a str {
278 self.input
279 }
280
281 fn whitespace_token(&mut self, start: usize) -> Token<'a> {
282 while self.eatc(' ') || self.eatc('\t') {
283 }
285 Whitespace(&self.input[start..self.current()])
286 }
287
288 fn comment_token(&mut self, start: usize) -> Token<'a> {
289 while let Some((_, ch)) = self.chars.clone().next() {
290 if ch != '\t' && (ch < '\u{20}' || ch > '\u{10ffff}') {
291 break;
292 }
293 self.one();
294 }
295 Comment(&self.input[start..self.current()])
296 }
297
298 fn read_string(
299 &mut self,
300 delim: char,
301 start: usize,
302 new_ch: &mut dyn FnMut(
303 &mut Tokenizer<'_>,
304 &mut MaybeString,
305 bool,
306 usize,
307 char,
308 ) -> Result<(), Error>,
309 ) -> Result<Token<'a>, Error> {
310 let mut multiline = false;
311 if self.eatc(delim) {
312 if self.eatc(delim) {
313 multiline = true;
314 } else {
315 return Ok(String {
316 src: &self.input[start..start + 2],
317 val: Cow::Borrowed(""),
318 multiline: false,
319 });
320 }
321 }
322 let mut val = MaybeString::NotEscaped(self.current());
323 let mut n = 0;
324 'outer: loop {
325 n += 1;
326 match self.one() {
327 Some((i, '\n')) => {
328 if multiline {
329 if self.input.as_bytes()[i] == b'\r' {
330 val.to_owned(&self.input[..i]);
331 }
332 if n == 1 {
333 val = MaybeString::NotEscaped(self.current());
334 } else {
335 val.push('\n');
336 }
337 continue;
338 } else {
339 return Err(Error::NewlineInString(i));
340 }
341 }
342 Some((mut i, ch)) if ch == delim => {
343 if multiline {
344 if !self.eatc(delim) {
345 val.push(delim);
346 continue 'outer;
347 }
348 if !self.eatc(delim) {
349 val.push(delim);
350 val.push(delim);
351 continue 'outer;
352 }
353 if self.eatc(delim) {
354 val.push(delim);
355 i += 1;
356 }
357 if self.eatc(delim) {
358 val.push(delim);
359 i += 1;
360 }
361 }
362 return Ok(String {
363 src: &self.input[start..self.current()],
364 val: val.into_cow(&self.input[..i]),
365 multiline,
366 });
367 }
368 Some((i, c)) => new_ch(self, &mut val, multiline, i, c)?,
369 None => return Err(Error::UnterminatedString(start)),
370 }
371 }
372 }
373
374 fn literal_string(&mut self, start: usize) -> Result<Token<'a>, Error> {
375 self.read_string('\'', start, &mut |_me, val, _multi, i, ch| {
376 if ch == '\u{09}' || ('\u{20}' <= ch && ch <= '\u{10ffff}' && ch != '\u{7f}') {
377 val.push(ch);
378 Ok(())
379 } else {
380 Err(Error::InvalidCharInString(i, ch))
381 }
382 })
383 }
384
385 fn basic_string(&mut self, start: usize) -> Result<Token<'a>, Error> {
386 self.read_string('"', start, &mut |me, val, multi, i, ch| match ch {
387 '\\' => {
388 val.to_owned(&me.input[..i]);
389 match me.chars.next() {
390 Some((_, '"')) => val.push('"'),
391 Some((_, '\\')) => val.push('\\'),
392 Some((_, 'b')) => val.push('\u{8}'),
393 Some((_, 'f')) => val.push('\u{c}'),
394 Some((_, 'n')) => val.push('\n'),
395 Some((_, 'r')) => val.push('\r'),
396 Some((_, 't')) => val.push('\t'),
397 Some((i, c @ 'u')) | Some((i, c @ 'U')) => {
398 let len = if c == 'u' { 4 } else { 8 };
399 val.push(me.hex(start, i, len)?);
400 }
401 Some((i, c @ ' ')) | Some((i, c @ '\t')) | Some((i, c @ '\n')) if multi => {
402 if c != '\n' {
403 while let Some((_, ch)) = me.chars.clone().next() {
404 match ch {
405 ' ' | '\t' => {
406 me.chars.next();
407 continue;
408 }
409 '\n' => {
410 me.chars.next();
411 break;
412 }
413 _ => return Err(Error::InvalidEscape(i, c)),
414 }
415 }
416 }
417 while let Some((_, ch)) = me.chars.clone().next() {
418 match ch {
419 ' ' | '\t' | '\n' => {
420 me.chars.next();
421 }
422 _ => break,
423 }
424 }
425 }
426 Some((i, c)) => return Err(Error::InvalidEscape(i, c)),
427 None => return Err(Error::UnterminatedString(start)),
428 }
429 Ok(())
430 }
431 ch if ch == '\u{09}' || ('\u{20}' <= ch && ch <= '\u{10ffff}' && ch != '\u{7f}') => {
432 val.push(ch);
433 Ok(())
434 }
435 _ => Err(Error::InvalidCharInString(i, ch)),
436 })
437 }
438
439 fn hex(&mut self, start: usize, i: usize, len: usize) -> Result<char, Error> {
440 let mut buf = StdString::with_capacity(len);
441 for _ in 0..len {
442 match self.one() {
443 Some((_, ch)) if ch as u32 <= 0x7F && ch.is_digit(16) => buf.push(ch),
444 Some((i, ch)) => return Err(Error::InvalidHexEscape(i, ch)),
445 None => return Err(Error::UnterminatedString(start)),
446 }
447 }
448 let val = u32::from_str_radix(&buf, 16).unwrap();
449 match char::from_u32(val) {
450 Some(ch) => Ok(ch),
451 None => Err(Error::InvalidEscapeValue(i, val)),
452 }
453 }
454
455 fn keylike(&mut self, start: usize) -> Token<'a> {
456 while let Some((_, ch)) = self.peek_one() {
457 if !is_keylike(ch) {
458 break;
459 }
460 self.one();
461 }
462 Keylike(&self.input[start..self.current()])
463 }
464
465 pub fn substr_offset(&self, s: &'a str) -> usize {
466 assert!(s.len() <= self.input.len());
467 let a = self.input.as_ptr() as usize;
468 let b = s.as_ptr() as usize;
469 assert!(a <= b);
470 b - a
471 }
472
473 fn step_span(&mut self, start: usize) -> Span {
475 let end = self
476 .peek_one()
477 .map(|t| t.0)
478 .unwrap_or_else(|| self.input.len());
479 Span { start, end }
480 }
481
482 fn peek_one(&mut self) -> Option<(usize, char)> {
484 self.chars.clone().next()
485 }
486
487 pub fn one(&mut self) -> Option<(usize, char)> {
489 self.chars.next()
490 }
491}
492
493impl<'a> Iterator for CrlfFold<'a> {
494 type Item = (usize, char);
495
496 fn next(&mut self) -> Option<(usize, char)> {
497 self.chars.next().map(|(i, c)| {
498 if c == '\r' {
499 let mut attempt = self.chars.clone();
500 if let Some((_, '\n')) = attempt.next() {
501 self.chars = attempt;
502 return (i, '\n');
503 }
504 }
505 (i, c)
506 })
507 }
508}
509
510impl MaybeString {
511 fn push(&mut self, ch: char) {
512 match *self {
513 MaybeString::NotEscaped(..) => {}
514 MaybeString::Owned(ref mut s) => s.push(ch),
515 }
516 }
517
518 fn to_owned(&mut self, input: &str) {
519 match *self {
520 MaybeString::NotEscaped(start) => {
521 *self = MaybeString::Owned(input[start..].to_owned());
522 }
523 MaybeString::Owned(..) => {}
524 }
525 }
526
527 fn into_cow(self, input: &str) -> Cow<'_, str> {
528 match self {
529 MaybeString::NotEscaped(start) => Cow::Borrowed(&input[start..]),
530 MaybeString::Owned(s) => Cow::Owned(s),
531 }
532 }
533}
534
/// Returns whether `ch` may appear in a bare key: an ASCII letter, an ASCII
/// digit, `-` or `_`.
fn is_keylike(ch: char) -> bool {
    match ch {
        'A'..='Z' | 'a'..='z' | '0'..='9' | '-' | '_' => true,
        _ => false,
    }
}
542
543impl<'a> Token<'a> {
544 pub fn describe(&self) -> &'static str {
545 match *self {
546 Token::Keylike(_) => "an identifier",
547 Token::Equals => "an equals",
548 Token::Period => "a period",
549 Token::Comment(_) => "a comment",
550 Token::Newline => "a newline",
551 Token::Whitespace(_) => "whitespace",
552 Token::Comma => "a comma",
553 Token::RightBrace => "a right brace",
554 Token::LeftBrace => "a left brace",
555 Token::RightBracket => "a right bracket",
556 Token::LeftBracket => "a left bracket",
557 Token::String { multiline, .. } => {
558 if multiline {
559 "a multiline string"
560 } else {
561 "a string"
562 }
563 }
564 Token::Colon => "a colon",
565 Token::Plus => "a plus",
566 }
567 }
568}
569
#[cfg(test)]
mod tests {
    use super::{Error, Token, Tokenizer};
    use std::borrow::Cow;

    /// Asserts that the first token of `input` fails with `expected` and that
    /// the tokenizer reports end of input afterwards.
    fn err(input: &str, expected: Error) {
        let mut tokenizer = Tokenizer::new(input);
        let actual = tokenizer.next().unwrap_err();
        assert_eq!(actual, expected);
        assert!(tokenizer.next().unwrap().is_none());
    }

    /// Asserts that `input` is exactly one string token spanning the whole
    /// input, with the given decoded value and multiline flag.
    fn single_string(input: &str, val: &str, multiline: bool) {
        let mut tokenizer = Tokenizer::new(input);
        let (_, token) = tokenizer.next().unwrap().unwrap();
        let expected = Token::String {
            src: input,
            val: Cow::Borrowed(val),
            multiline,
        };
        assert_eq!(token, expected);
        assert!(tokenizer.next().unwrap().is_none());
    }

    #[test]
    fn literal_strings() {
        single_string("''", "", false);
        single_string("''''''", "", true);
        single_string("'''\n'''", "", true);
        single_string("'a'", "a", false);
        single_string("'\"a'", "\"a", false);
        single_string("''''a'''", "'a", true);
        single_string("'''\n'a\n'''", "'a\n", true);
        single_string("'''a\n'a\r\n'''", "a\n'a\n", true);
    }

    #[test]
    fn basic_strings() {
        single_string(r#""""#, "", false);
        single_string(r#""""""""#, "", true);
        single_string(r#""a""#, "a", false);
        single_string(r#""""a""""#, "a", true);
        single_string(r#""\t""#, "\t", false);
        single_string(r#""\u0000""#, "\0", false);
        single_string(r#""\U00000000""#, "\0", false);
        single_string(r#""\U000A0000""#, "\u{A0000}", false);
        single_string(r#""\\t""#, "\\t", false);
        single_string("\"\t\"", "\t", false);
        single_string("\"\"\"\n\t\"\"\"", "\t", true);
        single_string("\"\"\"\\\n\"\"\"", "", true);
        single_string(
            "\"\"\"\\\n \t \t \\\r\n \t \n \t \r\n\"\"\"",
            "",
            true,
        );
        single_string(r#""\r""#, "\r", false);
        single_string(r#""\n""#, "\n", false);
        single_string(r#""\b""#, "\u{8}", false);
        single_string(r#""a\fa""#, "a\u{c}a", false);
        single_string(r#""\"a""#, "\"a", false);
        single_string("\"\"\"\na\"\"\"", "a", true);
        single_string("\"\"\"\n\"\"\"", "", true);
        single_string(r#""""a\"""b""""#, "a\"\"\"b", true);

        err(r#""\a"#, Error::InvalidEscape(2, 'a'));
        err("\"\\\n", Error::InvalidEscape(2, '\n'));
        err("\"\\\r\n", Error::InvalidEscape(2, '\n'));
        err("\"\\", Error::UnterminatedString(0));
        err("\"\u{0}", Error::InvalidCharInString(1, '\u{0}'));
        err(r#""\U00""#, Error::InvalidHexEscape(5, '"'));
        err(r#""\U00"#, Error::UnterminatedString(0));
        err(r#""\uD800"#, Error::InvalidEscapeValue(2, 0xd800));
        err(r#""\UFFFFFFFF"#, Error::InvalidEscapeValue(2, 0xffff_ffff));
    }

    #[test]
    fn keylike() {
        /// Asserts that `input` is exactly one key-like token.
        fn single_key(input: &str) {
            let mut tokenizer = Tokenizer::new(input);
            let (_, token) = tokenizer.next().unwrap().unwrap();
            assert_eq!(token, Token::Keylike(input));
            assert!(tokenizer.next().unwrap().is_none());
        }

        let inputs = ["foo", "0bar", "bar0", "1234", "a-b", "a_B", "-_-", "___"];
        for input in inputs.iter() {
            single_key(input);
        }
    }

    #[test]
    fn all() {
        /// Tokenizes `input` completely and compares every span, token and
        /// spanned source slice against `expected`.
        fn check(input: &str, expected: &[((usize, usize), Token<'_>, &str)]) {
            let mut tokenizer = Tokenizer::new(input);
            let mut actual: Vec<((usize, usize), Token<'_>, &str)> = Vec::new();
            while let Some((span, token)) = tokenizer.next().unwrap() {
                let covered = &input[span.start..span.end];
                actual.push((span.into(), token, covered));
            }
            // Compare element-wise first so a mismatch points at one token.
            for (got, want) in actual.iter().zip(expected) {
                assert_eq!(got, want);
            }
            assert_eq!(actual.len(), expected.len());
        }

        check(
            " a ",
            &[
                ((0, 1), Token::Whitespace(" "), " "),
                ((1, 2), Token::Keylike("a"), "a"),
                ((2, 3), Token::Whitespace(" "), " "),
            ],
        );

        check(
            " a\t [[]] \t [] {} , . =\n# foo \r\n#foo \n ",
            &[
                ((0, 1), Token::Whitespace(" "), " "),
                ((1, 2), Token::Keylike("a"), "a"),
                ((2, 4), Token::Whitespace("\t "), "\t "),
                ((4, 5), Token::LeftBracket, "["),
                ((5, 6), Token::LeftBracket, "["),
                ((6, 7), Token::RightBracket, "]"),
                ((7, 8), Token::RightBracket, "]"),
                ((8, 11), Token::Whitespace(" \t "), " \t "),
                ((11, 12), Token::LeftBracket, "["),
                ((12, 13), Token::RightBracket, "]"),
                ((13, 14), Token::Whitespace(" "), " "),
                ((14, 15), Token::LeftBrace, "{"),
                ((15, 16), Token::RightBrace, "}"),
                ((16, 17), Token::Whitespace(" "), " "),
                ((17, 18), Token::Comma, ","),
                ((18, 19), Token::Whitespace(" "), " "),
                ((19, 20), Token::Period, "."),
                ((20, 21), Token::Whitespace(" "), " "),
                ((21, 22), Token::Equals, "="),
                ((22, 23), Token::Newline, "\n"),
                ((23, 29), Token::Comment("# foo "), "# foo "),
                ((29, 31), Token::Newline, "\r\n"),
                ((31, 36), Token::Comment("#foo "), "#foo "),
                ((36, 37), Token::Newline, "\n"),
                ((37, 38), Token::Whitespace(" "), " "),
            ],
        );
    }

    #[test]
    fn bare_cr_bad() {
        // A carriage return without a following line feed is not a newline.
        err("\r", Error::Unexpected(0, '\r'));
        err("'\n", Error::NewlineInString(1));
        err("'\u{0}", Error::InvalidCharInString(1, '\u{0}'));
        err("'", Error::UnterminatedString(0));
        err("\u{0}", Error::Unexpected(0, '\u{0}'));
    }

    #[test]
    fn bad_comment() {
        // The comment token stops in front of the NUL, which is then
        // reported on its own by the following call.
        let mut tokenizer = Tokenizer::new("#\u{0}");
        tokenizer.next().unwrap().unwrap();
        assert_eq!(tokenizer.next(), Err(Error::Unexpected(1, '\u{0}')));
        assert!(tokenizer.next().unwrap().is_none());
    }
}