sizzle_parser/
token.rs

1//! First stage tokenizer.
2//!
3//! This varies from a textbook tokenizer in that it tracks indentation in order
4//! to emit special `Indent` and `Deindent` tokens.  These are treated like
5//! other tokens used for blocks to create token tree nodes with children.
6
7use thiserror::Error;
8
9use crate::{
10    gobbler::Gobbler,
11    names::{is_valid_ident_continuing_char, is_valid_ident_initial_char, Identifier, NameError},
12    src_pos::{PosTbl, SrcPos},
13};
14
/// Token carrying no extra data: the tag is the unit type `()`.
pub type Token = TaggedToken<()>;

/// Token tagged with the [`SrcPos`] where it was lexed.
pub(crate) type SrcToken = TaggedToken<SrcPos>;
20
/// Token with a tag.
///
/// The tag can be used for something like a span location in the original
/// source or to assign an identifier across structures.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum TaggedToken<T> {
    // Keywords and structural elements.
    /// `import` keyword.
    Import(T),
    /// `as` keyword.
    As(T),
    /// `class` keyword.
    Class(T),
    /// `:` punctuation.
    Colon(T),
    /// `=` punctuation.
    Eq(T),
    /// `,` punctuation.
    Comma(T),
    /// `.` punctuation.
    Dot(T),
    /// `\n` newline.
    Newline(T),
    /// `null` keyword.
    Null(T),

    // Identifiers.
    /// An identifier, validated by the `names` module rules.
    Identifier(T, Identifier),

    // Expressions.
    /// An integer literal.
    IntegerLiteral(T, u64),
    /// `<<` left-shift operator.
    Shl(T),
    /// `*` operator.
    Mul(T),

    // Structural, these are treated specially in token trees later.
    /// `[` open bracket.
    OpenBracket(T),
    /// `]` close bracket.
    CloseBracket(T),
    /// `(` open parenthesis.
    OpenParen(T),
    /// `)` close parenthesis.
    CloseParen(T),
    /// Synthetic token emitted when a line indents one level deeper.
    Indent(T),
    /// Synthetic token emitted when indentation drops one level (also
    /// emitted at end-of-input for each still-open indent).
    Deindent(T),

    // Misc
    /// Triple double quote docstring, with its cleaned-up text.
    DocString(T, String),
}
77
impl<T> TaggedToken<T> {
    /// Returns a reference to the tag on the token.
    ///
    /// Every variant carries a tag in its first position, so this match is
    /// exhaustive and purely mechanical.
    pub fn tag(&self) -> &T {
        match self {
            Self::Import(t) => t,
            Self::As(t) => t,
            Self::Class(t) => t,
            Self::Colon(t) => t,
            Self::Eq(t) => t,
            Self::Comma(t) => t,
            Self::Dot(t) => t,
            Self::Newline(t) => t,
            Self::Null(t) => t,
            Self::Identifier(t, _) => t,
            Self::IntegerLiteral(t, _) => t,
            Self::Shl(t) => t,
            Self::Mul(t) => t,
            Self::OpenBracket(t) => t,
            Self::CloseBracket(t) => t,
            Self::OpenParen(t) => t,
            Self::CloseParen(t) => t,
            Self::Indent(t) => t,
            Self::Deindent(t) => t,
            Self::DocString(t, _) => t,
        }
    }

    /// Converts the token to an untagged token, dropping the tag and cloning
    /// any payload (identifier or docstring text).
    ///
    /// Useful for comparing token sequences irrespective of source position.
    pub fn to_untagged(&self) -> Token {
        match self {
            Self::Import(_) => Token::Import(()),
            Self::As(_) => Token::As(()),
            Self::Class(_) => Token::Class(()),
            Self::Colon(_) => Token::Colon(()),
            Self::Eq(_) => Token::Eq(()),
            Self::Comma(_) => Token::Comma(()),
            Self::Dot(_) => Token::Dot(()),
            Self::Newline(_) => Token::Newline(()),
            Self::Null(_) => Token::Null(()),
            Self::Identifier(_, ident) => Token::Identifier((), ident.clone()),
            Self::IntegerLiteral(_, v) => Token::IntegerLiteral((), *v),
            Self::Shl(_) => Token::Shl(()),
            Self::Mul(_) => Token::Mul(()),
            Self::OpenBracket(_) => Token::OpenBracket(()),
            Self::CloseBracket(_) => Token::CloseBracket(()),
            Self::OpenParen(_) => Token::OpenParen(()),
            Self::CloseParen(_) => Token::CloseParen(()),
            Self::Indent(_) => Token::Indent(()),
            Self::Deindent(_) => Token::Deindent(()),
            Self::DocString(_, s) => Token::DocString((), s.clone()),
        }
    }
}
131
/// Errors produced while tokenizing source text.
#[derive(Debug, Error)]
pub enum TokenError {
    /// A character no token rule accepts, with its char offset.
    #[error("unexpected char '{0}' at pos {1}")]
    UnexpectedChar(char, usize),

    /// Input ended where more characters were required.
    #[error("unexpected end of input")]
    UnexpectedEnd,

    /// Input ended partway through the named construct (e.g. a docstring).
    #[error("unexpected end of input while reading {0}")]
    UnexpectedEndOf(&'static str),

    /// Leading whitespace that does not fit the established indent style.
    #[error("invalid indent at {0} (was {1:?})")]
    InvalidIndent(usize, Indent),

    /// First leading whitespace seen that is neither all spaces nor all tabs.
    #[error("unrecognizable indent at {0}")]
    UnrecognizableIndent(usize),

    /// A numeric literal that could not be parsed into a `u64`.
    #[error("invalid integer '{0}'")]
    InvalidInt(String),

    /// An identifier rejected by the name validation rules.
    #[error("invalid name: {0}")]
    InvalidName(#[from] NameError),
}
155
/// Describes how we're interpreting indentation.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum Indent {
    /// Each indent level is exactly this many spaces, inferred from the
    /// first indented line seen.
    Spaces(u8),
    /// Each indent level is one tab character.
    Tab,
}
162
/// Accumulates tokens while tracking indentation state.
pub(crate) struct TokenSeqBuilder {
    /// Indentation style, lazily inferred from the first indented line.
    indent_ty: Option<Indent>,
    /// Current nesting depth, in indent levels.
    indent_level: usize,
    /// Tokens emitted so far, in source order.
    output: Vec<SrcToken>,
}
168
169impl TokenSeqBuilder {
170    pub(crate) fn new() -> Self {
171        Self {
172            indent_ty: None,
173            indent_level: 0,
174            output: Vec::new(),
175        }
176    }
177
178    fn _indent_level(&self) -> usize {
179        self.indent_level
180    }
181
182    /// Does some really complicated case analysis to infer an indent level from a span of whitespace.
183    fn infer_indent_level(&mut self, indent: &[char], at: usize) -> Result<usize, TokenError> {
184        match self.indent_ty {
185            Some(i @ Indent::Spaces(n)) => {
186                if is_all_spaces(indent) {
187                    let found_spaces = indent.len();
188                    if found_spaces.is_multiple_of(n as usize) {
189                        let ind_cnt = found_spaces / n as usize;
190                        Ok(ind_cnt)
191                    } else {
192                        Err(TokenError::InvalidIndent(at, i))
193                    }
194                } else {
195                    Err(TokenError::InvalidIndent(at, i))
196                }
197            }
198            Some(i @ Indent::Tab) => {
199                if is_all_tabs(indent) {
200                    let found_tabs = indent.len();
201                    Ok(found_tabs)
202                } else {
203                    Err(TokenError::InvalidIndent(at, i))
204                }
205            }
206            None => {
207                let is_spaces = is_all_spaces(indent);
208                let is_tabs = is_all_tabs(indent);
209
210                // Doesn't matter what it is, this is just zero.
211                if indent.is_empty() {
212                    return Ok(0);
213                }
214
215                if is_spaces {
216                    self.indent_ty = Some(Indent::Spaces(indent.len() as u8));
217                    Ok(1)
218                } else if is_tabs {
219                    self.indent_ty = Some(Indent::Tab);
220                    Ok(indent.len())
221                } else {
222                    Err(TokenError::UnrecognizableIndent(at))
223                }
224            }
225        }
226    }
227
228    fn push_token(&mut self, t: SrcToken) {
229        self.output.push(t);
230    }
231
232    /// Updates the indentation level, producing indent tokens as necessary.
233    fn update_indent_level(&mut self, level: usize, sp: SrcPos) {
234        let diff = level as isize - self.indent_level as isize;
235        match diff {
236            0 => {}
237
238            // Deindentating.
239            d if d < 0 => {
240                for _ in 0..(-d) {
241                    self.push_token(TaggedToken::Deindent(sp));
242                }
243                self.indent_level = level;
244            }
245
246            // Indenting.
247            d if d > 0 => {
248                for _ in 0..d {
249                    self.push_token(TaggedToken::Indent(sp));
250                }
251                self.indent_level = level;
252            }
253
254            _ => unreachable!(),
255        }
256    }
257
258    /// Attempts to strip the indent prefix off of a string that's apparently a line.
259    fn strip_indent<'s>(&self, mut s: &'s str) -> Option<&'s str> {
260        match self.indent_ty {
261            Some(ind) => match ind {
262                Indent::Spaces(n) => {
263                    for _i in 0..(n as usize * self.indent_level) {
264                        s = s.strip_prefix(" ")?;
265                    }
266                    Some(s)
267                }
268                Indent::Tab => {
269                    for _i in 0..self.indent_level {
270                        s = s.strip_prefix("\t")?;
271                    }
272                    Some(s)
273                }
274            },
275
276            // If we haven't had an indent yet, then we can just assume it's 0
277            // and we don't have to do anything.
278            None => Some(s),
279        }
280    }
281
282    /// Optimistically tries to clean up the docstring.
283    fn cleanup_docstring(&self, s: &str) -> String {
284        let mut buf = String::new();
285        for l in s.lines() {
286            let ls = match self.strip_indent(l) {
287                Some(s) => s,
288                None => l,
289            };
290
291            #[allow(deprecated)]
292            let ls = ls.trim_right();
293            buf.extend(ls.chars());
294            buf.push('\n');
295        }
296
297        buf.trim().to_owned()
298    }
299
300    fn finish(mut self, sp: SrcPos) -> Result<Vec<SrcToken>, TokenError> {
301        // Automatically close the rest of the indents.
302        for _ in 0..self.indent_level {
303            self.push_token(TaggedToken::Deindent(sp));
304        }
305
306        Ok(self.output)
307    }
308}
309
/// Reports whether every character yielded is an ASCII space.
///
/// Vacuously true for an empty iterator.
fn is_all_spaces<'c>(iter: impl IntoIterator<Item = &'c char>) -> bool {
    !iter.into_iter().any(|&c| c != ' ')
}
313
/// Reports whether every character yielded is a tab.
///
/// Vacuously true for an empty iterator.
fn is_all_tabs<'c>(iter: impl IntoIterator<Item = &'c char>) -> bool {
    !iter.into_iter().any(|&c| c != '\t')
}
317
/// Tokenizes a char slice into a flat sequence of position-tagged tokens.
///
/// Indentation changes after newlines are materialized as `Indent` /
/// `Deindent` tokens, and any indents still open at end-of-input are closed
/// by `finish`.
pub(crate) fn parse_char_array_to_tokens(s: &[char]) -> Result<Vec<SrcToken>, TokenError> {
    // Table for mapping char offsets to source positions when tagging tokens.
    let sp_tbl = PosTbl::generate(s.iter().copied());

    let mut builder = TokenSeqBuilder::new();

    let mut gob = Gobbler::new(s);

    while gob.has_entry() {
        let cur = *gob.get_expect();
        let next = gob.get_rel(1).copied();
        let sp = sp_tbl.expect_srcpos(gob.at());

        #[cfg(test)]
        eprintln!(
            "considering {cur:?} (indent level {})",
            builder._indent_level()
        );

        // Handle simple cases, comments, and whitespace.
        match cur {
            // Interior spaces are insignificant; indentation is only measured
            // right after a newline (below).
            ' ' => {
                gob.gobble_one();
                continue;
            }

            // Comments are just ignoring every char until the end of the line.
            '#' => {
                gob.gobble_until(|c| *c == '\n');
                continue;
            }

            // Newlines are the special case since this is where we have to figure out whitespace!
            '\n' => {
                builder.push_token(SrcToken::Newline(sp));
                gob.gobble_one();

                // Now i is the first one after the newline.  If this isn't the
                // end of the buffer, let's do some work to figure out if we
                // should do indents.
                if gob.has_entry() {
                    let new_cur = gob.get_expect();
                    let new_sp = sp_tbl.expect_srcpos(gob.at());
                    if *new_cur == '\n' {
                        // Just get it next time.
                        continue;
                    }

                    // NOTE(review): '\n' is ASCII whitespace, so a line of
                    // spaces followed by a newline is gobbled into one mixed
                    // span here, which infer_indent_level then rejects —
                    // confirm whether whitespace-only lines are meant to error.
                    let at_before = gob.at();
                    let ws_span = gob
                        .gobble_slice_up_to(|c| !c.is_ascii_whitespace())
                        .expect("token: parse whitespace");
                    let cnt = builder.infer_indent_level(ws_span, at_before)?;

                    builder.update_indent_level(cnt, new_sp);
                }

                continue;
            }

            // Single-char tokens; the shared `gobble_one` below consumes them.
            ':' => builder.push_token(SrcToken::Colon(sp)),
            '=' => builder.push_token(SrcToken::Eq(sp)),
            ',' => builder.push_token(SrcToken::Comma(sp)),
            '.' => builder.push_token(SrcToken::Dot(sp)),
            '*' => builder.push_token(SrcToken::Mul(sp)),
            '[' => builder.push_token(SrcToken::OpenBracket(sp)),
            ']' => builder.push_token(SrcToken::CloseBracket(sp)),
            '(' => builder.push_token(SrcToken::OpenParen(sp)),
            ')' => builder.push_token(SrcToken::CloseParen(sp)),

            // `<` is only valid as part of `<<`.
            '<' => {
                if let Some(next) = next {
                    if next == '<' {
                        builder.push_token(SrcToken::Shl(sp));
                        gob.gobble_exact(2);
                        continue;
                    } else {
                        return Err(TokenError::UnexpectedChar(next, gob.at() + 1));
                    }
                } else {
                    return Err(TokenError::UnexpectedEnd);
                }
            }

            // Doc comments are kinda weird.
            '"' => {
                let off_1 = gob.get_rel(1).copied();
                let off_2 = gob.get_rel(2).copied();

                // Match cases to make sure we recognize the token correctly.
                match (off_1, off_2) {
                    // This is the only correct case.
                    (Some('"'), Some('"')) => {
                        gob.gobble_exact(3);
                    }
                    _ => return Err(TokenError::UnexpectedEndOf("docstring#begin")),
                }

                // Now scan forwards until we find the matching triple double quote.
                let Some(doc_span) = gob.gobble_slice_for_pattern(&['"', '"', '"']) else {
                    return Err(TokenError::UnexpectedEndOf("docstring#end"));
                };

                let doc_text = doc_span.into_iter().collect::<String>();
                gob.gobble_exact(3); // to move past the close quote

                #[cfg(test)]
                {
                    let entry = gob.get();
                    eprintln!("AFTER DOCSTRING {entry:?}");
                }

                // Strip indentation/trailing whitespace before storing.
                let doc_cleaned_text = builder.cleanup_docstring(&doc_text);
                builder.push_token(TaggedToken::DocString(sp, doc_cleaned_text));

                continue;
            }

            // Identifiers and keywords share a first-char rule.
            c if is_valid_ident_initial_char(c) => {
                let ident_chars =
                    gob.gobble_slice_up_to_end(|c| !is_valid_ident_continuing_char(*c));

                let ident_str = ident_chars.into_iter().collect::<String>();

                // Keywords are like identifiers, but separated out.
                if let Some(kwtok) = try_parse_keyword(&ident_str, sp) {
                    builder.push_token(kwtok);
                } else {
                    let ident = Identifier::try_from(ident_str)?;
                    builder.push_token(SrcToken::Identifier(sp, ident));
                }

                continue;
            }

            // Integer literals: a run of numeric chars parsed as u64.
            c if c.is_numeric() => {
                let num_chars = gob.gobble_slice_up_to_end(|c| !char::is_numeric(*c));

                let num_str = num_chars.iter().collect::<String>();
                let v = num_str
                    .parse::<u64>()
                    .map_err(|_| TokenError::InvalidInt(num_str))?;
                builder.push_token(SrcToken::IntegerLiteral(sp, v));
                continue;
            }

            _ => return Err(TokenError::UnexpectedChar(cur, gob.at())),
        }

        // Reached only by the single-char token arms above; every other arm
        // `continue`s after consuming its own input.
        gob.gobble_one();
    }

    // Close any indents still open at end-of-input.
    builder.finish(sp_tbl.expect_end())
}
471
472fn try_parse_keyword(s: &str, sp: SrcPos) -> Option<SrcToken> {
473    Some(match s {
474        "import" => SrcToken::Import(sp),
475        "as" => SrcToken::As(sp),
476        "class" => SrcToken::Class(sp),
477        "null" => SrcToken::Null(sp),
478        _ => return None,
479    })
480}
481
#[cfg(test)]
mod tests {
    use super::{parse_char_array_to_tokens, SrcToken, TaggedToken, TokenSeqBuilder};

    /// Tokenizes a test string, panicking on any tokenizer error.
    fn lex(s: &str) -> Vec<SrcToken> {
        let chars = s.chars().collect::<Vec<_>>();
        parse_char_array_to_tokens(&chars).expect("test: invoke parse_char_array_to_tokens")
    }

    /// Extracts the first docstring's text, if any.
    fn first_docstring(toks: &[SrcToken]) -> Option<&str> {
        toks.iter().find_map(|t| match t {
            TaggedToken::DocString(_, text) => Some(text.as_str()),
            _ => None,
        })
    }

    #[test]
    fn test_whitespace_spaces() {
        let mut builder = TokenSeqBuilder::new();

        let cnt = builder
            .infer_indent_level(&[' ', ' ', ' ', ' '], 5)
            .unwrap();
        assert_eq!(cnt, 1);

        let cnt = builder
            .infer_indent_level(&[' ', ' ', ' ', ' '], 5)
            .unwrap();
        assert_eq!(cnt, 1);

        let cnt = builder
            .infer_indent_level(&[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '], 5)
            .unwrap();
        assert_eq!(cnt, 2);

        // Not a multiple of the established 4-space width.
        let _ = builder
            .infer_indent_level(&[' ', ' ', ' ', ' ', ' ', ' ', ' '], 5)
            .expect_err("test: should have errored");

        // Tabs are rejected once space style is established.
        let _ = builder
            .infer_indent_level(&['\t'], 5)
            .expect_err("test: should have errored");
    }

    #[test]
    fn test_whitespace_tabs() {
        let mut builder = TokenSeqBuilder::new();

        let cnt = builder.infer_indent_level(&['\t'], 5).unwrap();
        assert_eq!(cnt, 1);

        let cnt = builder.infer_indent_level(&['\t', '\t'], 5).unwrap();
        assert_eq!(cnt, 2);

        // Spaces are rejected once tab style is established.
        let _ = builder
            .infer_indent_level(&[' ', ' '], 5)
            .expect_err("test: should have errored");
    }

    #[test]
    fn test_parse_const() {
        let toks = lex("FOO_BAR = 1234");
        eprintln!("{toks:#?}");

        // Exactly: identifier, `=`, integer literal.
        assert_eq!(toks.len(), 3);
        assert!(matches!(toks[0], TaggedToken::Identifier(_, _)));
        assert!(matches!(toks[1], TaggedToken::Eq(_)));
        assert!(matches!(toks[2], TaggedToken::IntegerLiteral(_, 1234)));
    }

    #[test]
    fn test_parse_whitespace_consts() {
        let toks = lex("\nFOO = 123\n\n\nBAR = 555\nBAZ = 999");
        eprintln!("{toks:#?}");

        // All three literals survive the blank lines, in order.
        let ints = toks
            .iter()
            .filter_map(|t| match t {
                TaggedToken::IntegerLiteral(_, v) => Some(*v),
                _ => None,
            })
            .collect::<Vec<_>>();
        assert_eq!(ints, vec![123, 555, 999]);
    }

    #[test]
    fn test_parse_shl() {
        let toks = lex("\nFOO = 10 << 30");
        eprintln!("{toks:#?}");

        assert!(toks.iter().any(|t| matches!(t, TaggedToken::Shl(_))));
    }

    #[test]
    fn test_parse_container_def() {
        let toks = lex("class Point(Container):\n  x_pos: int32\n  y_pos: int32\n");
        eprintln!("{toks:#?}");

        assert!(matches!(toks[0], TaggedToken::Class(_)));

        // The class body opens exactly one indent level, closed at EOF.
        let indents = toks
            .iter()
            .filter(|t| matches!(t, TaggedToken::Indent(_)))
            .count();
        let deindents = toks
            .iter()
            .filter(|t| matches!(t, TaggedToken::Deindent(_)))
            .count();
        assert_eq!(indents, 1);
        assert_eq!(deindents, 1);
    }

    #[test]
    fn test_parse_container_def_doc() {
        let toks = lex("class Point(Container):\n  \"\"\"2-dimensional cartesian point\"\"\"\n  x_pos: int32\n  y_pos: int32\n");

        // Make sure the string we found is what we expected.
        assert_eq!(first_docstring(&toks), Some("2-dimensional cartesian point"));

        eprintln!("{toks:#?}");
    }

    #[test]
    fn test_parse_container_def_doc_multiline() {
        let s = "class Point(Container):\n  \"\"\"\n  2-dimensional cartesian point\n  i love mathematics\n  \"\"\"\n  x_pos: int32\n  y_pos: int32\n";
        let toks = lex(s);

        // Cleanup strips the body indent and surrounding blank lines.
        assert_eq!(
            first_docstring(&toks),
            Some("2-dimensional cartesian point\ni love mathematics")
        );

        eprintln!("{toks:#?}");
    }

    #[test]
    fn test_parse_container_def_comments() {
        let s_without = "\nclass Point(Container):\n  x_pos: int32\n  y_pos: int32\n";
        let s_with = "# comment   \nclass Point(Container):# another\n  x_pos: int32 # haha yes yes\n  y_pos: int32   # hello\n";

        let toks_without = lex(s_without);
        let toks_with = lex(s_with);

        // Comments must not affect the token stream (positions aside).
        let utoks_without = toks_without
            .iter()
            .map(|t| t.to_untagged())
            .collect::<Vec<_>>();
        let utoks_with = toks_with
            .iter()
            .map(|t| t.to_untagged())
            .collect::<Vec<_>>();

        assert_eq!(utoks_without, utoks_with);

        eprintln!("{toks_without:#?}");
    }
}