1#![allow(missing_docs)]
2use crate::{Span, value::Key};
5use std::{borrow::Cow, char, str};
6
/// A single lexical token, borrowing its text from the tokenizer's input.
#[derive(Debug, PartialEq, Eq)]
pub enum Token<'a> {
    /// A run of spaces and/or tabs, exactly as written in the input.
    Whitespace(&'a str),
    /// A line break (`\n`, or `\r\n` folded into one newline).
    Newline,
    /// A comment running from its `#` up to, but not including, the line break.
    Comment(&'a str),

    /// `=`
    Equals,
    /// `.`
    Period,
    /// `,`
    Comma,
    /// `:`
    Colon,
    /// `+`
    Plus,
    /// `{`
    LeftBrace,
    /// `}`
    RightBrace,
    /// `[`
    LeftBracket,
    /// `]`
    RightBracket,

    /// A bare run of ASCII alphanumerics, `-` and `_`.
    Keylike(&'a str),
    /// A quoted string (basic or literal, single-line or multiline).
    String {
        /// The raw source text of the string, delimiters included.
        src: &'a str,
        /// The decoded contents; borrowed when no decoding was necessary.
        val: Cow<'a, str>,
        /// `true` when the string was written with triple delimiters.
        multiline: bool,
    },
}
30
/// Failures the tokenizer can report. Every `usize` is a byte position.
#[derive(Debug, PartialEq, Eq)]
pub enum Error {
    /// A character that is not permitted inside a string, with its position.
    InvalidCharInString(usize, char),
    /// A backslash followed by a character that does not form a known escape.
    InvalidEscape(usize, char),
    /// A non-hexadecimal character inside a `\u` / `\U` escape.
    InvalidHexEscape(usize, char),
    /// A `\u` / `\U` escape whose value is not a valid `char`: position of the
    /// `u`/`U`, number of hex digits in the escape, and the decoded value.
    InvalidEscapeValue(usize, usize, u32),
    /// A line break inside a single-line string.
    NewlineInString(usize),
    /// A character that cannot start any token.
    Unexpected(usize, char),
    /// End of input reached inside a string; holds the position of the opening delimiter.
    UnterminatedString(usize),
    /// A multiline string used where a table key was required; start position of
    /// the string and an end position for the reported range.
    MultilineStringKey(usize, usize),
    /// A specific token was required but something else was found.
    Wanted {
        /// Position at which the token was expected.
        at: usize,
        /// Description of what was required.
        expected: &'static str,
        /// Description of what was actually there.
        found: &'static str,
    },
}
47
/// Lexer that walks a borrowed input string and produces tokens on demand.
///
/// It is `Clone` so that look-ahead can be done by stepping a copy.
#[derive(Clone)]
pub struct Tokenizer<'a> {
    /// The complete text being tokenized; token slices borrow from it.
    input: &'a str,
    /// Cursor over `input` yielding `(byte offset, char)` with `\r\n` folded to `\n`.
    chars: CrlfFold<'a>,
}
53
/// Character cursor whose `Iterator` implementation reports a `\r\n` pair as a
/// single `\n` located at the offset of the `\r`.
#[derive(Clone)]
struct CrlfFold<'a> {
    /// The underlying `(byte offset, char)` iterator over the input.
    chars: str::CharIndices<'a>,
}
58
/// Contents of a string being read, kept as a borrow for as long as possible.
#[derive(Debug)]
enum MaybeString {
    /// Nothing had to be rewritten so far; holds the byte offset in the input
    /// at which the contents begin.
    NotEscaped(usize),
    /// The contents had to be copied; holds the decoded text accumulated so far.
    Owned(String),
}
64
65impl<'a> Tokenizer<'a> {
66 pub fn new(input: &'a str) -> Tokenizer<'a> {
67 let mut t = Tokenizer {
68 input,
69 chars: CrlfFold {
70 chars: input.char_indices(),
71 },
72 };
73 t.eatc('\u{feff}');
75 t
76 }
77
78 pub fn step(&mut self) -> Result<Option<(Span, Token<'a>)>, Error> {
79 let (start, token) = match self.one() {
80 Some((start, '\n')) => (start, Token::Newline),
81 Some((start, ' ' | '\t')) => (start, self.whitespace_token(start)),
82 Some((start, '#')) => (start, self.comment_token(start)),
83 Some((start, '=')) => (start, Token::Equals),
84 Some((start, '.')) => (start, Token::Period),
85 Some((start, ',')) => (start, Token::Comma),
86 Some((start, ':')) => (start, Token::Colon),
87 Some((start, '+')) => (start, Token::Plus),
88 Some((start, '{')) => (start, Token::LeftBrace),
89 Some((start, '}')) => (start, Token::RightBrace),
90 Some((start, '[')) => (start, Token::LeftBracket),
91 Some((start, ']')) => (start, Token::RightBracket),
92 Some((start, '\'')) => return self.literal_string(start).map(|(s, t)| Some((s, t))),
93 Some((start, '"')) => return self.basic_string(start).map(|(s, t)| Some((s, t))),
94 Some((start, ch)) if is_keylike(ch) => (start, self.keylike(start)),
95 Some((start, ch)) => return Err(Error::Unexpected(start, ch)),
96 None => return Ok(None),
97 };
98
99 let span = self.step_span(start);
100 Ok(Some((span, token)))
101 }
102
103 pub fn peek(&mut self) -> Result<Option<(Span, Token<'a>)>, Error> {
104 self.clone().step()
105 }
106
107 pub fn eat(&mut self, expected: Token<'a>) -> Result<bool, Error> {
108 self.eat_spanned(expected).map(|s| s.is_some())
109 }
110
111 pub fn eat_spanned(&mut self, expected: Token<'a>) -> Result<Option<Span>, Error> {
113 let span = match self.peek()? {
114 Some((span, ref found)) if expected == *found => span,
115 Some(_) | None => return Ok(None),
116 };
117
118 drop(self.step());
119 Ok(Some(span))
120 }
121
122 pub fn expect(&mut self, expected: Token<'a>) -> Result<(), Error> {
123 let _ = self.expect_spanned(expected)?;
125 Ok(())
126 }
127
128 pub fn expect_spanned(&mut self, expected: Token<'a>) -> Result<Span, Error> {
130 let current = self.current();
131 match self.step()? {
132 Some((span, found)) => {
133 if expected == found {
134 Ok(span)
135 } else {
136 Err(Error::Wanted {
137 at: current,
138 expected: expected.describe(),
139 found: found.describe(),
140 })
141 }
142 }
143 None => Err(Error::Wanted {
144 at: self.input.len(),
145 expected: expected.describe(),
146 found: "eof",
147 }),
148 }
149 }
150
151 pub fn table_key(&mut self) -> Result<Key<'a>, Error> {
152 let current = self.current();
153 match self.step()? {
154 Some((span, Token::Keylike(k))) => Ok(Key {
155 span,
156 name: k.into(),
157 }),
158 Some((
159 span,
160 Token::String {
161 src,
162 val,
163 multiline,
164 ..
165 },
166 )) => {
167 let offset = self.substr_offset(src);
168 if multiline {
169 return Err(Error::MultilineStringKey(offset, offset + val.len()));
170 }
171 match src.find('\n') {
172 None => Ok(Key { span, name: val }),
173 Some(i) => Err(Error::InvalidCharInString(i, '\n')),
175 }
176 }
177 Some((_, other)) => Err(Error::Wanted {
178 at: current,
179 expected: "a table key",
180 found: other.describe(),
181 }),
182 None => Err(Error::Wanted {
183 at: self.input.len(),
184 expected: "a table key",
185 found: "eof",
186 }),
187 }
188 }
189
190 pub fn eat_whitespace(&mut self) {
191 while self.eatc(' ') || self.eatc('\t') {
192 }
194 }
195
196 pub fn eat_comment(&mut self) -> Result<bool, Error> {
197 if !self.eatc('#') {
198 return Ok(false);
199 }
200 drop(self.comment_token(0));
201 self.eat_newline_or_eof().map(|()| true)
202 }
203
204 pub fn eat_newline_or_eof(&mut self) -> Result<(), Error> {
205 let current = self.current();
206 match self.step()? {
207 None | Some((_, Token::Newline)) => Ok(()),
208 Some((_, other)) => Err(Error::Wanted {
209 at: current,
210 expected: "newline",
211 found: other.describe(),
212 }),
213 }
214 }
215
216 pub fn skip_to_newline(&mut self) {
217 loop {
218 match self.one() {
219 Some((_, '\n')) | None => break,
220 _ => {}
221 }
222 }
223 }
224
225 fn eatc(&mut self, ch: char) -> bool {
226 match self.chars.clone().next() {
227 Some((_, ch2)) if ch == ch2 => {
228 self.one();
229 true
230 }
231 _ => false,
232 }
233 }
234
235 pub fn current(&mut self) -> usize {
236 match self.chars.clone().next() {
237 Some(i) => i.0,
238 None => self.input.len(),
239 }
240 }
241
242 fn whitespace_token(&mut self, start: usize) -> Token<'a> {
243 while self.eatc(' ') || self.eatc('\t') {
244 }
246 Token::Whitespace(&self.input[start..self.current()])
247 }
248
249 fn comment_token(&mut self, start: usize) -> Token<'a> {
250 while let Some((_, ch)) = self.chars.clone().next() {
251 if ch != '\t' && !('\u{20}'..='\u{10ffff}').contains(&ch) {
252 break;
253 }
254 self.one();
255 }
256 Token::Comment(&self.input[start..self.current()])
257 }
258
    /// Reads the remainder of a string whose opening `delim`, located at byte
    /// offset `start`, has already been consumed.
    ///
    /// Detects the empty string and the triple-delimiter (multiline) form,
    /// handles line breaks and the closing delimiter itself, and hands every
    /// other character to `new_ch`, which receives the tokenizer, the value
    /// accumulated so far, the multiline flag, the character's byte offset and
    /// the character.
    ///
    /// Returns the span of the string's contents (for an empty string: the
    /// opening delimiter) together with the [`Token::String`].
    ///
    /// # Errors
    ///
    /// * [`Error::NewlineInString`] for a line break in a single-line string.
    /// * [`Error::UnterminatedString`] if the input ends before the closing delimiter.
    /// * Any error returned by `new_ch`.
    #[allow(clippy::type_complexity)]
    fn read_string(
        &mut self,
        delim: char,
        start: usize,
        new_ch: &mut dyn FnMut(
            &mut Tokenizer<'_>,
            &mut MaybeString,
            bool,
            usize,
            char,
        ) -> Result<(), Error>,
    ) -> Result<(Span, Token<'a>), Error> {
        let mut multiline = false;
        if self.eatc(delim) {
            if self.eatc(delim) {
                // Three delimiters in a row open a multiline string.
                multiline = true;
            } else {
                // Exactly two delimiters: the empty string. The span covers the
                // opening delimiter and `src` covers both delimiters.
                return Ok((
                    Span::new(start as u32, (start + 1) as u32),
                    Token::String {
                        src: &self.input[start..start + 2],
                        val: Cow::Borrowed(""),
                        multiline: false,
                    },
                ));
            }
        }
        // Start out borrowing the contents from the position right after the
        // opening delimiter(s).
        let mut val = MaybeString::NotEscaped(self.current());
        // Number of characters taken by the loop so far; 1 means "the very first".
        let mut n = 0;
        loop {
            n += 1;
            match self.one() {
                Some((i, '\n')) => {
                    if multiline {
                        // The cursor folds "\r\n" into '\n' reported at the
                        // offset of the '\r'. The raw text then differs from
                        // the value, so borrowing is no longer possible.
                        if self.input.as_bytes()[i] == b'\r' {
                            val.make_owned(&self.input[..i]);
                        }
                        if n == 1 {
                            // A line break directly after the opening delimiter
                            // is dropped: restart the value after it.
                            val = MaybeString::NotEscaped(self.current());
                        } else {
                            val.push('\n');
                        }
                    } else {
                        return Err(Error::NewlineInString(i));
                    }
                }
                Some((mut i, ch)) if ch == delim => {
                    let span = if multiline {
                        // One or two delimiters not followed by a third are
                        // ordinary content.
                        if !self.eatc(delim) {
                            val.push(delim);
                            continue;
                        }
                        if !self.eatc(delim) {
                            val.push(delim);
                            val.push(delim);
                            continue;
                        }
                        // Three delimiters were found. Up to two more directly
                        // following them are taken as content; `i` is advanced
                        // so that a borrowed value includes them as well.
                        if self.eatc(delim) {
                            val.push(delim);
                            i += 1;
                        }
                        if self.eatc(delim) {
                            val.push(delim);
                            i += 1;
                        }

                        // Skip the three opening delimiters, plus the line
                        // break immediately after them if there is one.
                        let maybe_nl = self.input.as_bytes()[start + 3];
                        let start_off = if maybe_nl == b'\n' {
                            4
                        } else if maybe_nl == b'\r' {
                            5
                        } else {
                            3
                        };

                        // The cursor sits just past the closing delimiters.
                        Span::new((start + start_off) as u32, (self.current() - 3) as u32)
                    } else {
                        Span::new((start + 1) as u32, (self.current() - 1) as u32)
                    };

                    return Ok((
                        span,
                        Token::String {
                            src: &self.input[start..self.current()],
                            // A still-borrowed value ends at `i`.
                            val: val.into_cow(&self.input[..i]),
                            multiline,
                        },
                    ));
                }
                // Everything else is validated and appended by the callback.
                Some((i, c)) => new_ch(self, &mut val, multiline, i, c)?,
                None => return Err(Error::UnterminatedString(start)),
            }
        }
    }
362
363 fn literal_string(&mut self, start: usize) -> Result<(Span, Token<'a>), Error> {
364 self.read_string('\'', start, &mut |_me, val, _multi, i, ch| {
365 if ch == '\u{09}' || (ch != '\u{7f}' && ('\u{20}'..='\u{10ffff}').contains(&ch)) {
366 val.push(ch);
367 Ok(())
368 } else {
369 Err(Error::InvalidCharInString(i, ch))
370 }
371 })
372 }
373
374 fn basic_string(&mut self, start: usize) -> Result<(Span, Token<'a>), Error> {
375 self.read_string('"', start, &mut |me, val, multi, i, ch| match ch {
376 '\\' => {
377 val.make_owned(&me.input[..i]);
378 match me.chars.next() {
379 Some((_, '"')) => val.push('"'),
380 Some((_, '\\')) => val.push('\\'),
381 Some((_, 'b')) => val.push('\u{8}'),
382 Some((_, 'f')) => val.push('\u{c}'),
383 Some((_, 'n')) => val.push('\n'),
384 Some((_, 'r')) => val.push('\r'),
385 Some((_, 't')) => val.push('\t'),
386 Some((i, c @ ('u' | 'U'))) => {
387 let c = if c == 'u' {
388 me.hex::<4>(start, i)
389 } else {
390 me.hex::<8>(start, i)
391 };
392 val.push(c?);
393 }
394 Some((i, c @ (' ' | '\t' | '\n'))) if multi => {
395 if c != '\n' {
396 while let Some((_, ch)) = me.chars.clone().next() {
397 match ch {
398 ' ' | '\t' => {
399 me.chars.next();
400 }
401 '\n' => {
402 me.chars.next();
403 break;
404 }
405 _ => return Err(Error::InvalidEscape(i, c)),
406 }
407 }
408 }
409 while let Some((_, ch)) = me.chars.clone().next() {
410 match ch {
411 ' ' | '\t' | '\n' => {
412 me.chars.next();
413 }
414 _ => break,
415 }
416 }
417 }
418 Some((i, c)) => return Err(Error::InvalidEscape(i, c)),
419 None => return Err(Error::UnterminatedString(start)),
420 }
421 Ok(())
422 }
423 ch if ch == '\u{09}' || (ch != '\u{7f}' && ('\u{20}'..='\u{10ffff}').contains(&ch)) => {
424 val.push(ch);
425 Ok(())
426 }
427 _ => Err(Error::InvalidCharInString(i, ch)),
428 })
429 }
430
431 fn hex<const N: usize>(&mut self, start: usize, i: usize) -> Result<char, Error> {
432 let mut buf = [0; N];
433 for b in buf.iter_mut() {
434 match self.one() {
435 Some((_, ch)) if ch as u32 <= 0x7F && ch.is_ascii_hexdigit() => *b = ch as u8,
436 Some((i, ch)) => return Err(Error::InvalidHexEscape(i, ch)),
437 None => return Err(Error::UnterminatedString(start)),
438 }
439 }
440 let val = u32::from_str_radix(std::str::from_utf8(&buf).unwrap(), 16).unwrap();
441 match char::from_u32(val) {
442 Some(ch) => Ok(ch),
443 None => Err(Error::InvalidEscapeValue(i, N, val)),
444 }
445 }
446
447 fn keylike(&mut self, start: usize) -> Token<'a> {
448 while let Some((_, ch)) = self.peek_one() {
449 if !is_keylike(ch) {
450 break;
451 }
452 self.one();
453 }
454 Token::Keylike(&self.input[start..self.current()])
455 }
456
457 pub fn substr_offset(&self, s: &'a str) -> usize {
458 assert!(s.len() <= self.input.len());
459 let a = self.input.as_ptr() as usize;
460 let b = s.as_ptr() as usize;
461 assert!(a <= b);
462 b - a
463 }
464
465 fn step_span(&mut self, start: usize) -> Span {
467 let end = match self.peek_one() {
468 Some(t) => t.0,
469 None => self.input.len(),
470 };
471 Span {
472 start: start as u32,
473 end: end as u32,
474 }
475 }
476
477 fn peek_one(&mut self) -> Option<(usize, char)> {
479 self.chars.clone().next()
480 }
481
482 pub fn one(&mut self) -> Option<(usize, char)> {
484 self.chars.next()
485 }
486}
487
488impl Iterator for CrlfFold<'_> {
489 type Item = (usize, char);
490
491 fn next(&mut self) -> Option<(usize, char)> {
492 self.chars.next().map(|(i, c)| {
493 if c == '\r' {
494 let mut attempt = self.chars.clone();
495 if let Some((_, '\n')) = attempt.next() {
496 self.chars = attempt;
497 return (i, '\n');
498 }
499 }
500 (i, c)
501 })
502 }
503}
504
505impl MaybeString {
506 fn push(&mut self, ch: char) {
507 match *self {
508 MaybeString::NotEscaped(..) => {}
509 MaybeString::Owned(ref mut s) => s.push(ch),
510 }
511 }
512
513 fn make_owned(&mut self, input: &str) {
514 match *self {
515 MaybeString::NotEscaped(start) => {
516 *self = MaybeString::Owned(input[start..].to_owned());
517 }
518 MaybeString::Owned(..) => {}
519 }
520 }
521
522 fn into_cow(self, input: &str) -> Cow<'_, str> {
523 match self {
524 MaybeString::NotEscaped(start) => Cow::Borrowed(&input[start..]),
525 MaybeString::Owned(s) => Cow::Owned(s),
526 }
527 }
528}
529
/// Whether `ch` may appear in a bare key: an ASCII letter or digit, `-` or `_`.
#[inline]
fn is_keylike(ch: char) -> bool {
    matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_')
}
534
535impl Token<'_> {
536 pub fn describe(&self) -> &'static str {
537 match *self {
538 Token::Keylike(_) => "an identifier",
539 Token::Equals => "an equals",
540 Token::Period => "a period",
541 Token::Comment(_) => "a comment",
542 Token::Newline => "a newline",
543 Token::Whitespace(_) => "whitespace",
544 Token::Comma => "a comma",
545 Token::RightBrace => "a right brace",
546 Token::LeftBrace => "a left brace",
547 Token::RightBracket => "a right bracket",
548 Token::LeftBracket => "a left bracket",
549 Token::String { multiline, .. } => {
550 if multiline {
551 "a multiline string"
552 } else {
553 "a string"
554 }
555 }
556 Token::Colon => "a colon",
557 Token::Plus => "a plus",
558 }
559 }
560}